xref: /petsc/src/mat/impls/h2opus/cuda/math2opusutils.cu (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1 #include <petsc/private/matimpl.h>
2 #include <petsc/private/vecimpl.h>
3 #include <petscsf.h>
4 #if defined(PETSC_HAVE_CUDA)
5   #include <thrust/for_each.h>
6   #include <thrust/device_vector.h>
7   #include <thrust/execution_policy.h>
8 #endif
9 
10 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF sf, PetscInt nv, PetscInt ldr, PetscInt ldl, PetscSF *vsf)
11 {
12   PetscSF            rankssf;
13   const PetscSFNode *iremote;
14   PetscSFNode       *viremote, *rremotes;
15   const PetscInt    *ilocal;
16   PetscInt          *vilocal = NULL, *ldrs;
17   const PetscMPIInt *ranks;
18   PetscMPIInt       *sranks;
19   PetscInt           nranks, nr, nl, vnr, vnl, i, v, j, maxl;
20   MPI_Comm           comm;
21 
22   PetscFunctionBegin;
23   PetscValidHeaderSpecific(sf, PETSCSF_CLASSID, 1);
24   PetscValidLogicalCollectiveInt(sf, nv, 2);
25   PetscValidPointer(vsf, 5);
26   if (nv == 1) {
27     PetscCall(PetscObjectReference((PetscObject)sf));
28     *vsf = sf;
29     PetscFunctionReturn(0);
30   }
31   PetscCall(PetscObjectGetComm((PetscObject)sf, &comm));
32   PetscCall(PetscSFGetGraph(sf, &nr, &nl, &ilocal, &iremote));
33   PetscCall(PetscSFGetLeafRange(sf, NULL, &maxl));
34   maxl += 1;
35   if (ldl == PETSC_DECIDE) ldl = maxl;
36   if (ldr == PETSC_DECIDE) ldr = nr;
37   PetscCheck(ldr >= nr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid leading dimension %" PetscInt_FMT " < %" PetscInt_FMT, ldr, nr);
38   PetscCheck(ldl >= maxl, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Invalid leading dimension %" PetscInt_FMT " < %" PetscInt_FMT, ldl, maxl);
39   vnr = nr * nv;
40   vnl = nl * nv;
41   PetscCall(PetscMalloc1(vnl, &viremote));
42   if (ilocal) PetscCall(PetscMalloc1(vnl, &vilocal));
43 
44   /* TODO: Should this special SF be available, e.g.
45      PetscSFGetRanksSF or similar? */
46   PetscCall(PetscSFGetRootRanks(sf, &nranks, &ranks, NULL, NULL, NULL));
47   PetscCall(PetscMalloc1(nranks, &sranks));
48   PetscCall(PetscArraycpy(sranks, ranks, nranks));
49   PetscCall(PetscSortMPIInt(nranks, sranks));
50   PetscCall(PetscMalloc1(nranks, &rremotes));
51   for (i = 0; i < nranks; i++) {
52     rremotes[i].rank  = sranks[i];
53     rremotes[i].index = 0;
54   }
55   PetscCall(PetscSFDuplicate(sf, PETSCSF_DUPLICATE_CONFONLY, &rankssf));
56   PetscCall(PetscSFSetGraph(rankssf, 1, nranks, NULL, PETSC_OWN_POINTER, rremotes, PETSC_OWN_POINTER));
57   PetscCall(PetscMalloc1(nranks, &ldrs));
58   PetscCall(PetscSFBcastBegin(rankssf, MPIU_INT, &ldr, ldrs, MPI_REPLACE));
59   PetscCall(PetscSFBcastEnd(rankssf, MPIU_INT, &ldr, ldrs, MPI_REPLACE));
60   PetscCall(PetscSFDestroy(&rankssf));
61 
62   j = -1;
63   for (i = 0; i < nl; i++) {
64     const PetscInt r  = iremote[i].rank;
65     const PetscInt ii = iremote[i].index;
66 
67     if (j < 0 || sranks[j] != r) PetscCall(PetscFindMPIInt(r, nranks, sranks, &j));
68     PetscCheck(j >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unable to locate neighbor rank %" PetscInt_FMT, r);
69     for (v = 0; v < nv; v++) {
70       viremote[v * nl + i].rank  = r;
71       viremote[v * nl + i].index = v * ldrs[j] + ii;
72       if (ilocal) vilocal[v * nl + i] = v * ldl + ilocal[i];
73     }
74   }
75   PetscCall(PetscFree(sranks));
76   PetscCall(PetscFree(ldrs));
77   PetscCall(PetscSFCreate(comm, vsf));
78   PetscCall(PetscSFSetGraph(*vsf, vnr, vnl, vilocal, PETSC_OWN_POINTER, viremote, PETSC_OWN_POINTER));
79   PetscFunctionReturn(0);
80 }
81 
82 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat A, PetscSF h2sf, PetscSF *osf)
83 {
84   PetscSF asf;
85 
86   PetscFunctionBegin;
87   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
88   PetscValidHeaderSpecific(h2sf, PETSCSF_CLASSID, 2);
89   PetscValidPointer(osf, 3);
90   PetscCall(PetscObjectQuery((PetscObject)A, "_math2opus_vectorsf", (PetscObject *)&asf));
91   if (!asf) {
92     PetscInt lda;
93 
94     PetscCall(MatDenseGetLDA(A, &lda));
95     PetscCall(PetscSFGetVectorSF(h2sf, A->cmap->N, lda, PETSC_DECIDE, &asf));
96     PetscCall(PetscObjectCompose((PetscObject)A, "_math2opus_vectorsf", (PetscObject)asf));
97     PetscCall(PetscObjectDereference((PetscObject)asf));
98   }
99   *osf = asf;
100   PetscFunctionReturn(0);
101 }
102 
103 #if defined(PETSC_HAVE_CUDA)
104 struct SignVector_Functor {
105   const PetscScalar *v;
106   PetscScalar       *s;
107   SignVector_Functor(const PetscScalar *_v, PetscScalar *_s) : v(_v), s(_s) { }
108 
109   __host__ __device__ void operator()(PetscInt i) { s[i] = (v[i] < 0 ? -1 : 1); }
110 };
111 #endif
112 
113 PETSC_INTERN PetscErrorCode VecSign(Vec v, Vec s)
114 {
115   const PetscScalar *av;
116   PetscScalar       *as;
117   PetscInt           i, n;
118 #if defined(PETSC_HAVE_CUDA)
119   PetscBool viscuda, siscuda;
120 #endif
121 
122   PetscFunctionBegin;
123   PetscValidHeaderSpecific(v, VEC_CLASSID, 1);
124   PetscValidHeaderSpecific(s, VEC_CLASSID, 2);
125   PetscCall(VecGetLocalSize(s, &n));
126   PetscCall(VecGetLocalSize(v, &i));
127   PetscCheck(i == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "Invalid local sizes %" PetscInt_FMT " != %" PetscInt_FMT, i, n);
128 #if defined(PETSC_HAVE_CUDA)
129   PetscCall(PetscObjectTypeCompareAny((PetscObject)v, &viscuda, VECSEQCUDA, VECMPICUDA, ""));
130   PetscCall(PetscObjectTypeCompareAny((PetscObject)s, &siscuda, VECSEQCUDA, VECMPICUDA, ""));
131   viscuda = (PetscBool)(viscuda && !v->boundtocpu);
132   siscuda = (PetscBool)(siscuda && !s->boundtocpu);
133   if (viscuda && siscuda) {
134     PetscCall(VecCUDAGetArrayRead(v, &av));
135     PetscCall(VecCUDAGetArrayWrite(s, &as));
136     SignVector_Functor sign_vector(av, as);
137     thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(n), sign_vector);
138     PetscCall(VecCUDARestoreArrayWrite(s, &as));
139     PetscCall(VecCUDARestoreArrayRead(v, &av));
140   } else
141 #endif
142   {
143     PetscCall(VecGetArrayRead(v, &av));
144     PetscCall(VecGetArrayWrite(s, &as));
145     for (i = 0; i < n; i++) as[i] = PetscAbsScalar(av[i]) < 0 ? -1. : 1.;
146     PetscCall(VecRestoreArrayWrite(s, &as));
147     PetscCall(VecRestoreArrayRead(v, &av));
148   }
149   PetscFunctionReturn(0);
150 }
151 
152 #if defined(PETSC_HAVE_CUDA)
153 struct StandardBasis_Functor {
154   PetscScalar *v;
155   PetscInt     j;
156 
157   StandardBasis_Functor(PetscScalar *_v, PetscInt _j) : v(_v), j(_j) { }
158   __host__ __device__ void operator()(PetscInt i) { v[i] = (i == j ? 1 : 0); }
159 };
160 #endif
161 
162 PETSC_INTERN PetscErrorCode VecSetDelta(Vec x, PetscInt i)
163 {
164 #if defined(PETSC_HAVE_CUDA)
165   PetscBool iscuda;
166 #endif
167   PetscInt st, en;
168 
169   PetscFunctionBegin;
170   PetscCall(VecGetOwnershipRange(x, &st, &en));
171 #if defined(PETSC_HAVE_CUDA)
172   PetscCall(PetscObjectTypeCompareAny((PetscObject)x, &iscuda, VECSEQCUDA, VECMPICUDA, ""));
173   iscuda = (PetscBool)(iscuda && !x->boundtocpu);
174   if (iscuda) {
175     PetscScalar *ax;
176     PetscCall(VecCUDAGetArrayWrite(x, &ax));
177     StandardBasis_Functor delta(ax, i - st);
178     thrust::for_each(thrust::device, thrust::counting_iterator<PetscInt>(0), thrust::counting_iterator<PetscInt>(en - st), delta);
179     PetscCall(VecCUDARestoreArrayWrite(x, &ax));
180   } else
181 #endif
182   {
183     PetscCall(VecSet(x, 0.));
184     if (st <= i && i < en) PetscCall(VecSetValue(x, i, 1.0, INSERT_VALUES));
185     PetscCall(VecAssemblyBegin(x));
186     PetscCall(VecAssemblyEnd(x));
187   }
188   PetscFunctionReturn(0);
189 }
190 
191 /* these are approximate norms */
192 /* NORM_2: Estimating the matrix p-norm Nicholas J. Higham
193    NORM_1/NORM_INFINITY: A block algorithm for matrix 1-norm estimation, with an application to 1-norm pseudospectra Higham, Nicholas J. and Tisseur, Francoise */
194 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat A, NormType normtype, PetscInt normsamples, PetscReal *n)
195 {
196   Vec         x, y, w, z;
197   PetscReal   normz, adot;
198   PetscScalar dot;
199   PetscInt    i, j, N, jold = -1;
200   PetscBool   boundtocpu = PETSC_TRUE;
201 
202   PetscFunctionBegin;
203 #if defined(PETSC_HAVE_DEVICE)
204   boundtocpu = A->boundtocpu;
205 #endif
206   switch (normtype) {
207   case NORM_INFINITY:
208   case NORM_1:
209     if (normsamples < 0) normsamples = 10; /* pure guess */
210     if (normtype == NORM_INFINITY) {
211       Mat B;
212       PetscCall(MatCreateTranspose(A, &B));
213       A = B;
214     } else {
215       PetscCall(PetscObjectReference((PetscObject)A));
216     }
217     PetscCall(MatCreateVecs(A, &x, &y));
218     PetscCall(MatCreateVecs(A, &z, &w));
219     PetscCall(VecBindToCPU(x, boundtocpu));
220     PetscCall(VecBindToCPU(y, boundtocpu));
221     PetscCall(VecBindToCPU(z, boundtocpu));
222     PetscCall(VecBindToCPU(w, boundtocpu));
223     PetscCall(VecGetSize(x, &N));
224     PetscCall(VecSet(x, 1. / N));
225     *n = 0.0;
226     for (i = 0; i < normsamples; i++) {
227       PetscCall(MatMult(A, x, y));
228       PetscCall(VecSign(y, w));
229       PetscCall(MatMultTranspose(A, w, z));
230       PetscCall(VecNorm(z, NORM_INFINITY, &normz));
231       PetscCall(VecDot(x, z, &dot));
232       adot = PetscAbsScalar(dot);
233       PetscCall(PetscInfo(A, "%s norm it %" PetscInt_FMT " -> (%g %g)\n", NormTypes[normtype], i, (double)normz, (double)adot));
234       if (normz <= adot && i > 0) {
235         PetscCall(VecNorm(y, NORM_1, n));
236         break;
237       }
238       PetscCall(VecMax(z, &j, &normz));
239       if (j == jold) {
240         PetscCall(VecNorm(y, NORM_1, n));
241         PetscCall(PetscInfo(A, "%s norm it %" PetscInt_FMT " -> breakdown (j==jold)\n", NormTypes[normtype], i));
242         break;
243       }
244       jold = j;
245       PetscCall(VecSetDelta(x, j));
246     }
247     PetscCall(MatDestroy(&A));
248     PetscCall(VecDestroy(&x));
249     PetscCall(VecDestroy(&w));
250     PetscCall(VecDestroy(&y));
251     PetscCall(VecDestroy(&z));
252     break;
253   case NORM_2:
254     if (normsamples < 0) normsamples = 20; /* pure guess */
255     PetscCall(MatCreateVecs(A, &x, &y));
256     PetscCall(MatCreateVecs(A, &z, NULL));
257     PetscCall(VecBindToCPU(x, boundtocpu));
258     PetscCall(VecBindToCPU(y, boundtocpu));
259     PetscCall(VecBindToCPU(z, boundtocpu));
260     PetscCall(VecSetRandom(x, NULL));
261     PetscCall(VecNormalize(x, NULL));
262     *n = 0.0;
263     for (i = 0; i < normsamples; i++) {
264       PetscCall(MatMult(A, x, y));
265       PetscCall(VecNormalize(y, n));
266       PetscCall(MatMultTranspose(A, y, z));
267       PetscCall(VecNorm(z, NORM_2, &normz));
268       PetscCall(VecDot(x, z, &dot));
269       adot = PetscAbsScalar(dot);
270       PetscCall(PetscInfo(A, "%s norm it %" PetscInt_FMT " -> %g (%g %g)\n", NormTypes[normtype], i, (double)*n, (double)normz, (double)adot));
271       if (normz <= adot) break;
272       if (i < normsamples - 1) {
273         Vec t;
274 
275         PetscCall(VecNormalize(z, NULL));
276         t = x;
277         x = z;
278         z = t;
279       }
280     }
281     PetscCall(VecDestroy(&x));
282     PetscCall(VecDestroy(&y));
283     PetscCall(VecDestroy(&z));
284     break;
285   default:
286     SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "%s norm not supported", NormTypes[normtype]);
287   }
288   PetscCall(PetscInfo(A, "%s norm %g computed in %" PetscInt_FMT " iterations\n", NormTypes[normtype], (double)*n, i));
289   PetscFunctionReturn(0);
290 }
291