xref: /petsc/src/mat/impls/h2opus/cuda/math2opusutils.cu (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1 #include <petsc/private/matimpl.h>
2 #include <petsc/private/vecimpl.h>
3 #include <petscsf.h>
4 #if defined(PETSC_HAVE_CUDA)
5 #include <thrust/for_each.h>
6 #include <thrust/device_vector.h>
7 #include <thrust/execution_policy.h>
8 #endif
9 
10 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF sf, PetscInt nv, PetscInt ldr, PetscInt ldl, PetscSF *vsf)
11 {
12   PetscSF           rankssf;
13   const PetscSFNode *iremote;
14   PetscSFNode       *viremote,*rremotes;
15   const PetscInt    *ilocal;
16   PetscInt          *vilocal = NULL,*ldrs;
17   const PetscMPIInt *ranks;
18   PetscMPIInt       *sranks;
19   PetscInt          nranks,nr,nl,vnr,vnl,i,v,j,maxl;
20   MPI_Comm          comm;
21 
22   PetscFunctionBegin;
23   PetscValidHeaderSpecific(sf,PETSCSF_CLASSID,1);
24   PetscValidLogicalCollectiveInt(sf,nv,2);
25   PetscValidPointer(vsf,5);
26   if (nv == 1) {
27     PetscCall(PetscObjectReference((PetscObject)sf));
28     *vsf = sf;
29     PetscFunctionReturn(0);
30   }
31   PetscCall(PetscObjectGetComm((PetscObject)sf,&comm));
32   PetscCall(PetscSFGetGraph(sf,&nr,&nl,&ilocal,&iremote));
33   PetscCall(PetscSFGetLeafRange(sf,NULL,&maxl));
34   maxl += 1;
35   if (ldl == PETSC_DECIDE) ldl = maxl;
36   if (ldr == PETSC_DECIDE) ldr = nr;
37   PetscCheck(ldr >= nr,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Invalid leading dimension %" PetscInt_FMT " < %" PetscInt_FMT,ldr,nr);
38   PetscCheck(ldl >= maxl,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Invalid leading dimension %" PetscInt_FMT " < %" PetscInt_FMT,ldl,maxl);
39   vnr  = nr*nv;
40   vnl  = nl*nv;
41   PetscCall(PetscMalloc1(vnl,&viremote));
42   if (ilocal) PetscCall(PetscMalloc1(vnl,&vilocal));
43 
44   /* TODO: Should this special SF be available, e.g.
45      PetscSFGetRanksSF or similar? */
46   PetscCall(PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL));
47   PetscCall(PetscMalloc1(nranks,&sranks));
48   PetscCall(PetscArraycpy(sranks,ranks,nranks));
49   PetscCall(PetscSortMPIInt(nranks,sranks));
50   PetscCall(PetscMalloc1(nranks,&rremotes));
51   for (i=0;i<nranks;i++) {
52     rremotes[i].rank  = sranks[i];
53     rremotes[i].index = 0;
54   }
55   PetscCall(PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&rankssf));
56   PetscCall(PetscSFSetGraph(rankssf,1,nranks,NULL,PETSC_OWN_POINTER,rremotes,PETSC_OWN_POINTER));
57   PetscCall(PetscMalloc1(nranks,&ldrs));
58   PetscCall(PetscSFBcastBegin(rankssf,MPIU_INT,&ldr,ldrs,MPI_REPLACE));
59   PetscCall(PetscSFBcastEnd(rankssf,MPIU_INT,&ldr,ldrs,MPI_REPLACE));
60   PetscCall(PetscSFDestroy(&rankssf));
61 
62   j = -1;
63   for (i=0;i<nl;i++) {
64     const PetscInt r  = iremote[i].rank;
65     const PetscInt ii = iremote[i].index;
66 
67     if (j < 0 || sranks[j] != r) {
68       PetscCall(PetscFindMPIInt(r,nranks,sranks,&j));
69     }
70     PetscCheck(j >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unable to locate neighbor rank %" PetscInt_FMT,r);
71     for (v=0;v<nv;v++) {
72       viremote[v*nl + i].rank  = r;
73       viremote[v*nl + i].index = v*ldrs[j] + ii;
74       if (ilocal) vilocal[v*nl + i] = v*ldl + ilocal[i];
75     }
76   }
77   PetscCall(PetscFree(sranks));
78   PetscCall(PetscFree(ldrs));
79   PetscCall(PetscSFCreate(comm,vsf));
80   PetscCall(PetscSFSetGraph(*vsf,vnr,vnl,vilocal,PETSC_OWN_POINTER,viremote,PETSC_OWN_POINTER));
81   PetscFunctionReturn(0);
82 }
83 
84 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat A, PetscSF h2sf, PetscSF *osf)
85 {
86   PetscSF asf;
87 
88   PetscFunctionBegin;
89   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
90   PetscValidHeaderSpecific(h2sf,PETSCSF_CLASSID,2);
91   PetscValidPointer(osf,3);
92   PetscCall(PetscObjectQuery((PetscObject)A,"_math2opus_vectorsf",(PetscObject*)&asf));
93   if (!asf) {
94     PetscInt lda;
95 
96     PetscCall(MatDenseGetLDA(A,&lda));
97     PetscCall(PetscSFGetVectorSF(h2sf,A->cmap->N,lda,PETSC_DECIDE,&asf));
98     PetscCall(PetscObjectCompose((PetscObject)A,"_math2opus_vectorsf",(PetscObject)asf));
99     PetscCall(PetscObjectDereference((PetscObject)asf));
100   }
101   *osf = asf;
102   PetscFunctionReturn(0);
103 }
104 
105 #if defined(PETSC_HAVE_CUDA)
106 struct SignVector_Functor
107 {
108     const PetscScalar *v;
109     PetscScalar *s;
110     SignVector_Functor(const PetscScalar *_v, PetscScalar *_s) : v(_v), s(_s) {}
111 
112     __host__ __device__ void operator()(PetscInt i)
113     {
114         s[i] = (v[i] < 0 ? -1 : 1);
115     }
116 };
117 #endif
118 
119 PETSC_INTERN PetscErrorCode VecSign(Vec v, Vec s)
120 {
121   const PetscScalar *av;
122   PetscScalar       *as;
123   PetscInt          i,n;
124 #if defined(PETSC_HAVE_CUDA)
125   PetscBool         viscuda,siscuda;
126 #endif
127 
128   PetscFunctionBegin;
129   PetscValidHeaderSpecific(v,VEC_CLASSID,1);
130   PetscValidHeaderSpecific(s,VEC_CLASSID,2);
131   PetscCall(VecGetLocalSize(s,&n));
132   PetscCall(VecGetLocalSize(v,&i));
133   PetscCheck(i == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Invalid local sizes %" PetscInt_FMT " != %" PetscInt_FMT,i,n);
134 #if defined(PETSC_HAVE_CUDA)
135   PetscCall(PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,""));
136   PetscCall(PetscObjectTypeCompareAny((PetscObject)s,&siscuda,VECSEQCUDA,VECMPICUDA,""));
137   viscuda = (PetscBool)(viscuda && !v->boundtocpu);
138   siscuda = (PetscBool)(siscuda && !s->boundtocpu);
139   if (viscuda && siscuda) {
140     PetscCall(VecCUDAGetArrayRead(v,&av));
141     PetscCall(VecCUDAGetArrayWrite(s,&as));
142     SignVector_Functor sign_vector(av, as);
143     thrust::for_each(thrust::device,thrust::counting_iterator<PetscInt>(0),
144                      thrust::counting_iterator<PetscInt>(n), sign_vector);
145     PetscCall(VecCUDARestoreArrayWrite(s,&as));
146     PetscCall(VecCUDARestoreArrayRead(v,&av));
147   } else
148 #endif
149   {
150     PetscCall(VecGetArrayRead(v,&av));
151     PetscCall(VecGetArrayWrite(s,&as));
152     for (i=0;i<n;i++) as[i] = PetscAbsScalar(av[i]) < 0 ? -1. : 1.;
153     PetscCall(VecRestoreArrayWrite(s,&as));
154     PetscCall(VecRestoreArrayRead(v,&av));
155   }
156   PetscFunctionReturn(0);
157 }
158 
159 #if defined(PETSC_HAVE_CUDA)
160 struct StandardBasis_Functor
161 {
162     PetscScalar *v;
163     PetscInt j;
164 
165     StandardBasis_Functor(PetscScalar *_v, PetscInt _j) : v(_v), j(_j) {}
166     __host__ __device__ void operator()(PetscInt i)
167     {
168         v[i] = (i == j ? 1 : 0);
169     }
170 };
171 #endif
172 
173 PETSC_INTERN PetscErrorCode VecSetDelta(Vec x, PetscInt i)
174 {
175 #if defined(PETSC_HAVE_CUDA)
176   PetscBool iscuda;
177 #endif
178   PetscInt  st,en;
179 
180   PetscFunctionBegin;
181   PetscCall(VecGetOwnershipRange(x,&st,&en));
182 #if defined(PETSC_HAVE_CUDA)
183   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&iscuda,VECSEQCUDA,VECMPICUDA,""));
184   iscuda = (PetscBool)(iscuda && !x->boundtocpu);
185   if (iscuda) {
186     PetscScalar *ax;
187     PetscCall(VecCUDAGetArrayWrite(x,&ax));
188     StandardBasis_Functor delta(ax, i-st);
189     thrust::for_each(thrust::device,thrust::counting_iterator<PetscInt>(0),
190                      thrust::counting_iterator<PetscInt>(en-st), delta);
191     PetscCall(VecCUDARestoreArrayWrite(x,&ax));
192   } else
193 #endif
194   {
195     PetscCall(VecSet(x,0.));
196     if (st <= i && i < en) {
197       PetscCall(VecSetValue(x,i,1.0,INSERT_VALUES));
198     }
199     PetscCall(VecAssemblyBegin(x));
200     PetscCall(VecAssemblyEnd(x));
201   }
202   PetscFunctionReturn(0);
203 }
204 
205 /* these are approximate norms */
206 /* NORM_2: Estimating the matrix p-norm Nicholas J. Higham
207    NORM_1/NORM_INFINITY: A block algorithm for matrix 1-norm estimation, with an application to 1-norm pseudospectra Higham, Nicholas J. and Tisseur, Francoise */
208 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat A, NormType normtype, PetscInt normsamples, PetscReal* n)
209 {
210   Vec         x,y,w,z;
211   PetscReal   normz,adot;
212   PetscScalar dot;
213   PetscInt    i,j,N,jold = -1;
214   PetscBool   boundtocpu = PETSC_TRUE;
215 
216   PetscFunctionBegin;
217 #if defined(PETSC_HAVE_DEVICE)
218   boundtocpu = A->boundtocpu;
219 #endif
220   switch (normtype) {
221   case NORM_INFINITY:
222   case NORM_1:
223     if (normsamples < 0) normsamples = 10; /* pure guess */
224     if (normtype == NORM_INFINITY) {
225       Mat B;
226       PetscCall(MatCreateTranspose(A,&B));
227       A = B;
228     } else {
229       PetscCall(PetscObjectReference((PetscObject)A));
230     }
231     PetscCall(MatCreateVecs(A,&x,&y));
232     PetscCall(MatCreateVecs(A,&z,&w));
233     PetscCall(VecBindToCPU(x,boundtocpu));
234     PetscCall(VecBindToCPU(y,boundtocpu));
235     PetscCall(VecBindToCPU(z,boundtocpu));
236     PetscCall(VecBindToCPU(w,boundtocpu));
237     PetscCall(VecGetSize(x,&N));
238     PetscCall(VecSet(x,1./N));
239     *n   = 0.0;
240     for (i = 0; i < normsamples; i++) {
241       PetscCall(MatMult(A,x,y));
242       PetscCall(VecSign(y,w));
243       PetscCall(MatMultTranspose(A,w,z));
244       PetscCall(VecNorm(z,NORM_INFINITY,&normz));
245       PetscCall(VecDot(x,z,&dot));
246       adot = PetscAbsScalar(dot);
247       PetscCall(PetscInfo(A,"%s norm it %" PetscInt_FMT " -> (%g %g)\n",NormTypes[normtype],i,(double)normz,(double)adot));
248       if (normz <= adot && i > 0) {
249         PetscCall(VecNorm(y,NORM_1,n));
250         break;
251       }
252       PetscCall(VecMax(z,&j,&normz));
253       if (j == jold) {
254         PetscCall(VecNorm(y,NORM_1,n));
255         PetscCall(PetscInfo(A,"%s norm it %" PetscInt_FMT " -> breakdown (j==jold)\n",NormTypes[normtype],i));
256         break;
257       }
258       jold = j;
259       PetscCall(VecSetDelta(x,j));
260     }
261     PetscCall(MatDestroy(&A));
262     PetscCall(VecDestroy(&x));
263     PetscCall(VecDestroy(&w));
264     PetscCall(VecDestroy(&y));
265     PetscCall(VecDestroy(&z));
266     break;
267   case NORM_2:
268     if (normsamples < 0) normsamples = 20; /* pure guess */
269     PetscCall(MatCreateVecs(A,&x,&y));
270     PetscCall(MatCreateVecs(A,&z,NULL));
271     PetscCall(VecBindToCPU(x,boundtocpu));
272     PetscCall(VecBindToCPU(y,boundtocpu));
273     PetscCall(VecBindToCPU(z,boundtocpu));
274     PetscCall(VecSetRandom(x,NULL));
275     PetscCall(VecNormalize(x,NULL));
276     *n   = 0.0;
277     for (i = 0; i < normsamples; i++) {
278       PetscCall(MatMult(A,x,y));
279       PetscCall(VecNormalize(y,n));
280       PetscCall(MatMultTranspose(A,y,z));
281       PetscCall(VecNorm(z,NORM_2,&normz));
282       PetscCall(VecDot(x,z,&dot));
283       adot = PetscAbsScalar(dot);
284       PetscCall(PetscInfo(A,"%s norm it %" PetscInt_FMT " -> %g (%g %g)\n",NormTypes[normtype],i,(double)*n,(double)normz,(double)adot));
285       if (normz <= adot) break;
286       if (i < normsamples - 1) {
287         Vec t;
288 
289         PetscCall(VecNormalize(z,NULL));
290         t = x;
291         x = z;
292         z = t;
293       }
294     }
295     PetscCall(VecDestroy(&x));
296     PetscCall(VecDestroy(&y));
297     PetscCall(VecDestroy(&z));
298     break;
299   default:
300     SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"%s norm not supported",NormTypes[normtype]);
301   }
302   PetscCall(PetscInfo(A,"%s norm %g computed in %" PetscInt_FMT " iterations\n",NormTypes[normtype],(double)*n,i));
303   PetscFunctionReturn(0);
304 }
305