xref: /petsc/src/mat/impls/h2opus/cuda/math2opusutils.cu (revision 9371c9d470a9602b6d10a8bf50c9b2280a79e45a)
1 #include <petsc/private/matimpl.h>
2 #include <petsc/private/vecimpl.h>
3 #include <petscsf.h>
4 #if defined(PETSC_HAVE_CUDA)
5 #include <thrust/for_each.h>
6 #include <thrust/device_vector.h>
7 #include <thrust/execution_policy.h>
8 #endif
9 
10 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF sf, PetscInt nv, PetscInt ldr, PetscInt ldl, PetscSF *vsf) {
11   PetscSF            rankssf;
12   const PetscSFNode *iremote;
13   PetscSFNode       *viremote, *rremotes;
14   const PetscInt    *ilocal;
15   PetscInt          *vilocal = NULL, *ldrs;
16   const PetscMPIInt *ranks;
17   PetscMPIInt       *sranks;
18   PetscInt           nranks, nr, nl, vnr, vnl, i, v, j, maxl;
19   MPI_Comm           comm;
20 
21   PetscFunctionBegin;
22   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
23   PetscValidLogicalCollectiveInt(sf, nv, 2);
24   PetscValidPointer(vsf, 5);
25   if (nv == 1) {
26     PetscCall(PetscObjectReference((PetscObject)sf));
27     *vsf = sf;
28     PetscFunctionReturn(0);
29   }
30   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
31   PetscCall(PetscSFGetGraph(sf, &nr, &nl, &ilocal, &iremote));
32   PetscCall(PetscSFGetLeafRange(sf, NULL, &maxl));
33   maxl += 1;
34   if (ldl == PETSC_DECIDE) ldl = maxl;
35   if (ldr == PETSC_DECIDE) ldr = nr;
36   PetscCheck(ldr >= nr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid leading dimension %" PetscInt_FMT " < %" PetscInt_FMT, ldr, nr);
37   PetscCheck(ldl >= maxl, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid leading dimension %" PetscInt_FMT " < %" PetscInt_FMT, ldl, maxl);
38   vnr = nr * nv;
39   vnl = nl * nv;
40   PetscCall(PetscMalloc1(vnl, &viremote));
41   if (ilocal) PetscCall(PetscMalloc1(vnl, &vilocal));
42 
43   /* TODO: Should this special SF be available, e.g.
44      PetscSFGetRanksSF or similar? */
45   PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL));
46   PetscCall(PetscMalloc1(nranks, &sranks));
47   PetscCall(PetscArraycpy(sranks, ranks, nranks));
48   PetscCall(PetscSortMPIInt(nranks, sranks));
49   PetscCall(PetscMalloc1(nranks, &rremotes));
50   for (i = 0; i < nranks; i++) {
51     rremotes[i].rank  = sranks[i];
52     rremotes[i].index = 0;
53   }
54   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &rankssf));
55   PetscCall(PetscSFSetGraph(rankssf, 1, nranks, NULL, PETSC_OWN_POINTER, rremotes, PETSC_OWN_POINTER));
56   PetscCall(PetscMalloc1(nranks, &ldrs));
57   PetscCall(PetscSFBcastBegin(rankssf, MPIU_INT, &ldr, ldrs, MPI_REPLACE));
58   PetscCall(PetscSFBcastEnd(rankssf, MPIU_INT, &ldr, ldrs, MPI_REPLACE));
59   PetscCall(PetscSFDestroy(&rankssf));
60 
61   j = -1;
62   for (i = 0; i < nl; i++) {
63     const PetscInt r  = iremote[i].rank;
64     const PetscInt ii = iremote[i].index;
65 
66     if (j < 0 || sranks[j] != r) { PetscCall(PetscFindMPIInt(r, nranks, sranks, &j)); }
67     PetscCheck(j >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unable to locate neighbor rank %" PetscInt_FMT, r);
68     for (v = 0; v < nv; v++) {
69       viremote[v * nl + i].rank  = r;
70       viremote[v * nl + i].index = v * ldrs[j] + ii;
71       if (ilocal) vilocal[v * nl + i] = v * ldl + ilocal[i];
72     }
73   }
74   PetscCall(PetscFree(sranks));
75   PetscCall(PetscFree(ldrs));
76   PetscCall(PetscSFCreate(comm, vsf));
77   PetscCall(PetscSFSetGraph(*vsf, vnr, vnl, vilocal, PETSC_OWN_POINTER, viremote, PETSC_OWN_POINTER));
78   PetscFunctionReturn(0);
79 }
80 
81 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat A, PetscSF h2sf, PetscSF *osf) {
82   PetscSF asf;
83 
84   PetscFunctionBegin;
85   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
86   PetscValidHeaderSpecific(h2sf, PETSCSF_CLASSID, 2);
87   PetscValidPointer(osf, 3);
88   PetscCall(PetscObjectQuery((PetscObject)A, "_math2opus_vectorsf", (PetscObject *)&asf));
89   if (!asf) {
90     PetscInt lda;
91 
92     PetscCall(MatDenseGetLDA(A, &lda));
93     PetscCall(PetscSFGetVectorSF(h2sf, A->cmap->N, lda, PETSC_DECIDE, &asf));
94     PetscCall(PetscObjectCompose((PetscObject)A, "_math2opus_vectorsf", (PetscObject)asf));
95     PetscCall(PetscObjectDereference((PetscObject)asf));
96   }
97   *osf = asf;
98   PetscFunctionReturn(0);
99 }
100 
101 #if defined(PETSC_HAVE_CUDA)
102 struct SignVector_Functor {
103   const PetscScalar *v;
104   PetscScalar       *s;
105   SignVector_Functor(const PetscScalar *_v, PetscScalar *_s) : v(_v), s(_s) { }
106 
107   __host__ __device__ void operator()(PetscInt i) { s[i] = (v[i] < 0 ? -1 : 1); }
108 };
109 #endif
110 
111 PETSC_INTERN PetscErrorCode VecSign(Vec v, Vec s) {
112   const PetscScalar *av;
113   PetscScalar       *as;
114   PetscInt           i, n;
115 #if defined(PETSC_HAVE_CUDA)
116   PetscBool viscuda, siscuda;
117 #endif
118 
119   PetscFunctionBegin;
120   PetscValidHeaderSpecific(v, VEC_CLASSID, 1);
121   PetscValidHeaderSpecific(s, VEC_CLASSID, 2);
122   PetscCall(VecGetLocalSize(s, &n));
123   PetscCall(VecGetLocalSize(v, &i));
124   PetscCheck(i == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Invalid local sizes %" PetscInt_FMT " != %" PetscInt_FMT, i, n);
125 #if defined(PETSC_HAVE_CUDA)
126   PetscCall(PetscObjectTypeCompareAny((PetscObject)v, &viscuda, VECSEQCUDA, VECMPICUDA, ""));
127   PetscCall(PetscObjectTypeCompareAny((PetscObject)s, &siscuda, VECSEQCUDA, VECMPICUDA, ""));
128   viscuda = (PetscBool)(viscuda && !v->boundtocpu);
129   siscuda = (PetscBool)(siscuda && !s->boundtocpu);
130   if (viscuda && siscuda) {
131     PetscCall(VecCUDAGetArrayRead(v, &av));
132     PetscCall(VecCUDAGetArrayWrite(s, &as));
133     SignVector_Functor sign_vector(av, as);
134     thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(n), sign_vector);
135     PetscCall(VecCUDARestoreArrayWrite(s, &as));
136     PetscCall(VecCUDARestoreArrayRead(v, &av));
137   } else
138 #endif
139   {
140     PetscCall(VecGetArrayRead(v, &av));
141     PetscCall(VecGetArrayWrite(s, &as));
142     for (i = 0; i < n; i++) as[i] = PetscAbsScalar(av[i]) < 0 ? -1. : 1.;
143     PetscCall(VecRestoreArrayWrite(s, &as));
144     PetscCall(VecRestoreArrayRead(v, &av));
145   }
146   PetscFunctionReturn(0);
147 }
148 
149 #if defined(PETSC_HAVE_CUDA)
150 struct StandardBasis_Functor {
151   PetscScalar *v;
152   PetscInt     j;
153 
154   StandardBasis_Functor(PetscScalar *_v, PetscInt _j) : v(_v), j(_j) { }
155   __host__ __device__ void operator()(PetscInt i) { v[i] = (i == j ? 1 : 0); }
156 };
157 #endif
158 
159 PETSC_INTERN PetscErrorCode VecSetDelta(Vec x, PetscInt i) {
160 #if defined(PETSC_HAVE_CUDA)
161   PetscBool iscuda;
162 #endif
163   PetscInt st, en;
164 
165   PetscFunctionBegin;
166   PetscCall(VecGetOwnershipRange(x, &st, &en));
167 #if defined(PETSC_HAVE_CUDA)
168   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &iscuda, VECSEQCUDA, VECMPICUDA, ""));
169   iscuda = (PetscBool)(iscuda && !x->boundtocpu);
170   if (iscuda) {
171     PetscScalar *ax;
172     PetscCall(VecCUDAGetArrayWrite(x, &ax));
173     StandardBasis_Functor delta(ax, i - st);
174     thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(en - st), delta);
175     PetscCall(VecCUDARestoreArrayWrite(x, &ax));
176   } else
177 #endif
178   {
179     PetscCall(VecSet(x, 0.));
180     if (st <= i && i < en) { PetscCall(VecSetValue(x, i, 1.0, INSERT_VALUES)); }
181     PetscCall(VecAssemblyBegin(x));
182     PetscCall(VecAssemblyEnd(x));
183   }
184   PetscFunctionReturn(0);
185 }
186 
187 /* these are approximate norms */
188 /* NORM_2: Estimating the matrix p-norm Nicholas J. Higham
189    NORM_1/NORM_INFINITY: A block algorithm for matrix 1-norm estimation, with an application to 1-norm pseudospectra Higham, Nicholas J. and Tisseur, Francoise */
190 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat A, NormType normtype, PetscInt normsamples, PetscReal *n) {
191   Vec         x, y, w, z;
192   PetscReal   normz, adot;
193   PetscScalar dot;
194   PetscInt    i, j, N, jold = -1;
195   PetscBool   boundtocpu = PETSC_TRUE;
196 
197   PetscFunctionBegin;
198 #if defined(PETSC_HAVE_DEVICE)
199   boundtocpu = A->boundtocpu;
200 #endif
201   switch (normtype) {
202   case NORM_INFINITY:
203   case NORM_1:
204     if (normsamples < 0) normsamples = 10; /* pure guess */
205     if (normtype == NORM_INFINITY) {
206       Mat B;
207       PetscCall(MatCreateTranspose(A, &B));
208       A = B;
209     } else {
210       PetscCall(PetscObjectReference((PetscObject)A));
211     }
212     PetscCall(MatCreateVecs(A, &x, &y));
213     PetscCall(MatCreateVecs(A, &z, &w));
214     PetscCall(VecBindToCPU(x, boundtocpu));
215     PetscCall(VecBindToCPU(y, boundtocpu));
216     PetscCall(VecBindToCPU(z, boundtocpu));
217     PetscCall(VecBindToCPU(w, boundtocpu));
218     PetscCall(VecGetSize(x, &N));
219     PetscCall(VecSet(x, 1. / N));
220     *n = 0.0;
221     for (i = 0; i < normsamples; i++) {
222       PetscCall(MatMult(A, x, y));
223       PetscCall(VecSign(y, w));
224       PetscCall(MatMultTranspose(A, w, z));
225       PetscCall(VecNorm(z, NORM_INFINITY, &normz));
226       PetscCall(VecDot(x, z, &dot));
227       adot = PetscAbsScalar(dot);
228       PetscCall(PetscInfo(A, "%s norm it %" PetscInt_FMT " -> (%g %g)\n", NormTypes[normtype], i, (double)normz, (double)adot));
229       if (normz <= adot && i > 0) {
230         PetscCall(VecNorm(y, NORM_1, n));
231         break;
232       }
233       PetscCall(VecMax(z, &j, &normz));
234       if (j == jold) {
235         PetscCall(VecNorm(y, NORM_1, n));
236         PetscCall(PetscInfo(A, "%s norm it %" PetscInt_FMT " -> breakdown (j==jold)\n", NormTypes[normtype], i));
237         break;
238       }
239       jold = j;
240       PetscCall(VecSetDelta(x, j));
241     }
242     PetscCall(MatDestroy(&A));
243     PetscCall(VecDestroy(&x));
244     PetscCall(VecDestroy(&w));
245     PetscCall(VecDestroy(&y));
246     PetscCall(VecDestroy(&z));
247     break;
248   case NORM_2:
249     if (normsamples < 0) normsamples = 20; /* pure guess */
250     PetscCall(MatCreateVecs(A, &x, &y));
251     PetscCall(MatCreateVecs(A, &z, NULL));
252     PetscCall(VecBindToCPU(x, boundtocpu));
253     PetscCall(VecBindToCPU(y, boundtocpu));
254     PetscCall(VecBindToCPU(z, boundtocpu));
255     PetscCall(VecSetRandom(x, NULL));
256     PetscCall(VecNormalize(x, NULL));
257     *n = 0.0;
258     for (i = 0; i < normsamples; i++) {
259       PetscCall(MatMult(A, x, y));
260       PetscCall(VecNormalize(y, n));
261       PetscCall(MatMultTranspose(A, y, z));
262       PetscCall(VecNorm(z, NORM_2, &normz));
263       PetscCall(VecDot(x, z, &dot));
264       adot = PetscAbsScalar(dot);
265       PetscCall(PetscInfo(A, "%s norm it %" PetscInt_FMT " -> %g (%g %g)\n", NormTypes[normtype], i, (double)*n, (double)normz, (double)adot));
266       if (normz <= adot) break;
267       if (i < normsamples - 1) {
268         Vec t;
269 
270         PetscCall(VecNormalize(z, NULL));
271         t = x;
272         x = z;
273         z = t;
274       }
275     }
276     PetscCall(VecDestroy(&x));
277     PetscCall(VecDestroy(&y));
278     PetscCall(VecDestroy(&z));
279     break;
280   default: SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "%s norm not supported", NormTypes[normtype]);
281   }
282   PetscCall(PetscInfo(A, "%s norm %g computed in %" PetscInt_FMT " iterations\n", NormTypes[normtype], (double)*n, i));
283   PetscFunctionReturn(0);
284 }
285