xref: /libCEED/examples/nek/bps/bps.usr (revision d36c159f42e9222d4587decadab914cf55859f56)
1C Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2C the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3C reserved. See files LICENSE and NOTICE for details.
4C
5C This file is part of CEED, a collection of benchmarks, miniapps, software
6C libraries and APIs for efficient high-order finite element and spectral
7C element discretizations for exascale applications. For more information and
8C source code availability see http://github.com/ceed.
9C
10C The CEED research is supported by the Exascale Computing Project (17-SC-20-SC)
11C a collaborative effort of two U.S. Department of Energy organizations (Office
12C of Science and the National Nuclear Security Administration) responsible for
13C the planning and preparation of a capable exascale ecosystem, including
14C software, applications, hardware, advanced system engineering and early
15C testbed platforms, in support of the nation's exascale computing imperative.
16
17C> @file
18C> Mass and diffusion operators examples using Nek5000
19C TESTARGS -c {ceed_resource} -e bp1 -n 1 -b 4 -test
20
21C-----------------------------------------------------------------------
22      subroutine masssetupf(ctx,q,u1,u2,u3,u4,u5,u6,u7,
23     $  u8,u9,u10,u11,u12,u13,u14,u15,u16,v1,v2,v3,v4,v5,v6,v7,v8,
24     $  v9,v10,v11,v12,v13,v14,v15,v16,ierr)
25C     Set up mass operator
26C     Input: u1,u2,u3,q             Output: v1,v2,ierr
27      integer q,ierr
28      real*8 ctx(1)
29      real*8 u1(3*q)
30      real*8 u2(9*q)
31      real*8 u3(q)
32      real*8 v1(q)
33      real*8 v2(q)
34      real*8 a11,a12,a13,a21,a22,a23,a31,a32,a33
35      real*8 g11,g12,g13,g21,g22,g23,g31,g32,g33
36      real*8 jacmq
37
38C     Quadrature Point Loop
39      do i=1,q
40        a11=u2(i+q*0)
41        a12=u2(i+q*3)
42        a13=u2(i+q*6)
43
44        a21=u2(i+q*1)
45        a22=u2(i+q*4)
46        a23=u2(i+q*7)
47
48        a31=u2(i+q*2)
49        a32=u2(i+q*5)
50        a33=u2(i+q*8)
51
52        g11 = (a22*a33-a23*a32)
53        g12 = (a13*a32-a33*a12)
54        g13 = (a12*a23-a22*a13)
55
56        g21 = (a23*a31-a21*a33)
57        g22 = (a11*a33-a31*a13)
58        g23 = (a13*a21-a23*a11)
59
60        g31 = (a21*a32-a22*a31)
61        g32 = (a12*a31-a32*a11)
62        g33 = (a11*a22-a21*a12)
63
64        jacmq = a11*g11+a21*g12+a31*g13
65
66C       Rho
67        v1(i)=u3(i)*jacmq
68
69C       RHS
70        v2(i)=u3(i)*jacmq
71     $             *dsqrt(u1(i+q*0)*u1(i+q*0)
72     $                   +u1(i+q*1)*u1(i+q*1)
73     $                   +u1(i+q*2)*u1(i+q*2))
74      enddo
75
76      ierr=0
77      end
78C-----------------------------------------------------------------------
79      subroutine massf(ctx,q,u1,u2,u3,u4,u5,u6,u7,
80     $  u8,u9,u10,u11,u12,u13,u14,u15,u16,v1,v2,v3,v4,v5,v6,v7,v8,
81     $  v9,v10,v11,v12,v13,v14,v15,v16,ierr)
82C     Apply mass operator
83C     Input: u1,u2,q                Output: v1,ierr
84      integer q,ierr
85      real*8 ctx(1)
86      real*8 u1(q)
87      real*8 u2(q)
88      real*8 v1(q)
89
90C     Quadrature Point Loop
91      do i=1,q
92        v1(i)=u2(i)*u1(i)
93      enddo
94
95      ierr=0
96      end
97C-----------------------------------------------------------------------
98      subroutine diffsetupf(ctx,q,u1,u2,u3,u4,u5,u6,u7,
99     $  u8,u9,u10,u11,u12,u13,u14,u15,u16,v1,v2,v3,v4,v5,v6,v7,v8,
100     $  v9,v10,v11,v12,v13,v14,v15,v16,ierr)
101C     Set up diffusion operator
102C     Input: u1,u2,u3,q             Output: v1,v2,ierr
103      integer q,ierr
104      real*8 ctx(1)
105      real*8 u1(3*q)
106      real*8 u2(9*q)
107      real*8 u3(q)
108      real*8 v1(6*q)
109      real*8 v2(q)
110      real*8 a11,a12,a13,a21,a22,a23,a31,a32,a33
111      real*8 g11,g12,g13,g21,g22,g23,g31,g32,g33
112      real*8 jacmq,scl
113      real*8 c(3),k(3)
114
115C     Quadrature Point Loop
116      do i=1,q
117        pi = 3.14159265358979323846
118
119        c(1)=0.
120        c(2)=1.
121        c(3)=2.
122        k(1)=1.
123        k(2)=2.
124        k(3)=3.
125
126        a11=u2(i+q*0)
127        a12=u2(i+q*3)
128        a13=u2(i+q*6)
129
130        a21=u2(i+q*1)
131        a22=u2(i+q*4)
132        a23=u2(i+q*7)
133
134        a31=u2(i+q*2)
135        a32=u2(i+q*5)
136        a33=u2(i+q*8)
137
138        g11 = (a22*a33-a23*a32)
139        g12 = (a13*a32-a33*a12)
140        g13 = (a12*a23-a22*a13)
141
142        g21 = (a23*a31-a21*a33)
143        g22 = (a11*a33-a31*a13)
144        g23 = (a13*a21-a23*a11)
145
146        g31 = (a21*a32-a22*a31)
147        g32 = (a12*a31-a32*a11)
148        g33 = (a11*a22-a21*a12)
149
150        jacmq = a11*g11+a21*g12+a31*g13
151
152        scl = u3(i)/jacmq
153
154C       Geometric factors
155C       Stored in Voigt convention
156C       0 5 4
157C       5 1 3
158C       4 3 2
159        v1(i+0*q) = scl*(g11*g11+g12*g12+g13*g13) ! Grr
160        v1(i+1*q) = scl*(g21*g21+g22*g22+g23*g23) ! Gss
161        v1(i+2*q) = scl*(g31*g31+g32*g32+g33*g33) ! Gtt
162        v1(i+3*q) = scl*(g21*g31+g22*g32+g23*g33) ! Gst
163        v1(i+4*q) = scl*(g11*g31+g12*g32+g13*g33) ! Grt
164        v1(i+5*q) = scl*(g11*g21+g12*g22+g13*g23) ! Grs
165
166C       RHS
167        v2(i) = u3(i)*jacmq*pi*pi
168     $            *dsin(pi*(c(1)+k(1)*u1(i+0*q)))
169     $            *dsin(pi*(c(2)+k(2)*u1(i+1*q)))
170     $            *dsin(pi*(c(3)+k(3)*u1(i+2*q)))
171     $            *(k(1)*k(1)+k(2)*k(2)+k(3)*k(3))
172
173      enddo
174
175      ierr=0
176      end
177C-----------------------------------------------------------------------
178      subroutine diffusionf(ctx,q,u1,u2,u3,u4,u5,u6,u7,
179     $  u8,u9,u10,u11,u12,u13,u14,u15,u16,v1,v2,v3,v4,v5,v6,v7,v8,
180     $  v9,v10,v11,v12,v13,v14,v15,v16,ierr)
181C     Apply diffusion operator
182C     Input: u1,u2,q                Output: v1,ierr
183      integer q,ierr
184      real*8 ctx(1)
185      real*8 u1(3*q)
186      real*8 u2(6*q)
187      real*8 v1(3*q)
188
189C     Quadrature Point Loop
190      do i=1,q
191        v1(i+0*q)=
192     $     u2(i+0*q)*u1(i)+u2(i+5*q)*u1(i+q)+u2(i+4*q)*u1(i+2*q)
193        v1(i+1*q)=
194     $     u2(i+5*q)*u1(i)+u2(i+1*q)*u1(i+q)+u2(i+3*q)*u1(i+2*q)
195        v1(i+2*q)=
196     $     u2(i+4*q)*u1(i)+u2(i+3*q)*u1(i+q)+u2(i+2*q)*u1(i+2*q)
197      enddo
198
199      ierr=0
200      end
201C-----------------------------------------------------------------------
202      subroutine set_h2_as_rhoJac_GL(h2,bmq,nxq)
203C     Set h2 as rhoJac
204C     Input: bmq,nxq                Output: h2
205      include 'SIZE'
206      real*8 h2(1),bmq(1)
207
208      common /ctmp77/ wd(lxd),zd(lxd)
209      integer e,i,L
210
211      call zwgl(zd,wd,nxq)  ! nxq = number of points
212
213      q = 1.0               ! Later, this can be a function of position...
214
215      L = 0
216      do e=1,lelt
217      do i=1,nxq**ldim
218         L=L+1
219         h2(L) = q*bmq(L)
220      enddo
221      enddo
222
223      return
224      end
225C-----------------------------------------------------------------------
226      subroutine dist_fld_h1(e)
227C     Set distance initial condition for BP1
228C     Input:                        Output: e
229      include 'SIZE'
230      include 'TOTAL'
231      real*8 x,y,z
232      real*8 e(1)
233
234      n=lx1*ly1*lz1*nelt
235
236      do i=1,n
237        x=xm1(i,1,1,1)
238        y=ym1(i,1,1,1)
239        z=zm1(i,1,1,1)
240
241        e(i) = dsqrt(x*x+y*y+z*z)
242      enddo
243
244      call dsavg(e)  ! This is requisite for random fields
245
246      return
247      end
248C-----------------------------------------------------------------------
249      subroutine sin_fld_h1(e)
250C     Set sine initial condition for BP3
251C     Input:                        Output: e
252      include 'SIZE'
253      include 'TOTAL'
254      real*8 x,y,z
255      real*8 e(1)
256      real*8 c(3),k(3)
257
258      n=lx1*ly1*lz1*nelt
259      pi = 3.14159265358979323846
260
261      c(1)=0.
262      c(2)=1.
263      c(3)=2.
264      k(1)=1.
265      k(2)=2.
266      k(3)=3.
267
268      do i=1,n
269        x=xm1(i,1,1,1)
270        y=ym1(i,1,1,1)
271        z=zm1(i,1,1,1)
272
273        e(i) = dsin(pi*(c(1)+k(1)*x))
274     &        *dsin(pi*(c(2)+k(2)*y))
275     &        *dsin(pi*(c(3)+k(3)*z))
276
277      enddo
278
279      call dsavg(e)  ! This is requisite for random fields
280
281      return
282      end
283C-----------------------------------------------------------------------
284      subroutine uservp(ix,iy,iz,eg) ! set variable properties
285      include 'SIZE'
286      include 'TOTAL'
287      include 'NEKUSE'
288      integer e,f,eg
289C     e = gllel(eg)
290
291      udiff  = 0.0
292      utrans = 0.0
293
294      return
295      end
296C-----------------------------------------------------------------------
297      subroutine userf(ix,iy,iz,eg) ! set acceleration term
298C
299C     Note: this is an acceleration term, NOT a force!
300C     Thus, ffx will subsequently be multiplied by rho(x,t).
301C
302      include 'SIZE'
303      include 'TOTAL'
304      include 'NEKUSE'
305      integer e,f,eg
306C     e = gllel(eg)
307
308      ffx = 0.0
309      ffy = 0.0
310      ffz = 0.0
311
312      return
313      end
314C-----------------------------------------------------------------------
315      subroutine userq(i,j,k,eg) ! set source term
316      include 'SIZE'
317      include 'TOTAL'
318      include 'NEKUSE'
319      integer e,f,eg
320      e = gllel(eg)
321
322      qvol   = 0
323
324      return
325      end
326C-----------------------------------------------------------------------
327      subroutine userbc(ix,iy,iz,f,eg) ! set up boundary conditions
328C     NOTE ::: This subroutine MAY NOT be called by every process
329      include 'SIZE'
330      include 'TOTAL'
331      include 'NEKUSE'
332      integer e,f,eg
333
334      ux   = 0.0
335      uy   = 0.0
336      uz   = 0.0
337      temp = 0.0
338
339      return
340      end
341C-----------------------------------------------------------------------
342      subroutine useric(ix,iy,iz,eg) ! set up initial conditions
343      include 'SIZE'
344      include 'TOTAL'
345      include 'NEKUSE'
346      integer e,f,eg
347
348      ux   = 0.0
349      uy   = 0.0
350      uz   = 0.0
351      temp = 0.0
352
353      return
354      end
355C-----------------------------------------------------------------------
356      subroutine usrdat   ! This routine to modify element vertices
357      include 'SIZE'
358      include 'TOTAL'
359
360      return
361      end
362C-----------------------------------------------------------------------
363      subroutine usrdat2  ! This routine to modify mesh coordinates
364      include 'SIZE'
365      include 'TOTAL'
366
367      x0 = 0
368      x1 = 1
369      call rescale_x(xm1,x0,x1)
370      call rescale_x(ym1,x0,x1)
371      call rescale_x(zm1,x0,x1)
372
373      param(59)=1  ! Force Nek to use the "deformed element" formulation
374
375      return
376      end
377C-----------------------------------------------------------------------
378      subroutine usrdat3
379      include 'SIZE'
380      include 'TOTAL'
381
382      return
383      end
384C-----------------------------------------------------------------------
385      subroutine xmask1   (r1,h2,nel)
386C     Apply zero Dirichlet boundary conditions
387C     Input: h2,nel                 Output: r1
388      include 'SIZE'
389      include 'TOTAL'
390      real*8 r1(1),h2(1)
391
392      n=nx1*ny1*nz1*nel
393      do i=1,n
394         r1(i)=r1(i)*h2(i)
395      enddo
396
397      return
398      end
399C-----------------------------------------------------------------------
400      function glrdif(x,y,n)
401C     Compute Linfty norm of (x-y)
402C     Input: x,y                    Output: n
403      real*8 x(n),y(n)
404
405      dmx=0
406      xmx=0
407      ymx=0
408
409      do i=1,n
410         diff=abs(x(i)-y(i))
411         dmx =max(dmx,diff)
412         xmx =max(xmx,x(i))
413         ymx =max(ymx,y(i))
414      enddo
415
416      xmx = max(xmx,ymx)
417      dmx = glmax(dmx,1) ! max across processors
418      xmx = glmax(xmx,1)
419
420      if (xmx.gt.0) then
421         glrdif = dmx/xmx
422      else
423         glrdif = -dmx   ! Negative indicates something strange
424      endif
425
426      return
427      end
428C-----------------------------------------------------------------------
429      subroutine loc_grad3(ur,us,ut,u,N,D,Dt)
430C     3D transpose of local gradient
431C     Input: u,N,D,Dt               Output: ur,us,ut
432      real*8 ur(0:N,0:N,0:N),us(0:N,0:N,0:N),ut(0:N,0:N,0:N)
433      real*8 u (0:N,0:N,0:N)
434      real*8 D (0:N,0:N),Dt(0:N,0:N)
435
436      m1 = N+1
437      m2 = m1*m1
438
439      call mxm(D ,m1,u(0,0,0),m1,ur,m2)
440      do k=0,N
441         call mxm(u(0,0,k),m1,Dt,m1,us(0,0,k),m1)
442      enddo
443      call mxm(u(0,0,0),m2,Dt,m1,ut,m1)
444
445      return
446      end
447c-----------------------------------------------------------------------
448      subroutine loc_grad3t(u,ur,us,ut,N,D,Dt,w)
449C     3D transpose of local gradient
450C     Input: ur,us,ut,N,D,Dt        Output: u
451       real*8 u (0:N,0:N,0:N)
452       real*8 ur(0:N,0:N,0:N),us(0:N,0:N,0:N),ut(0:N,0:N,0:N)
453       real*8 D (0:N,0:N),Dt(0:N,0:N)
454       real*8 w (0:N,0:N,0:N)
455
456       m1 = N+1
457       m2 = m1*m1
458       m3 = m1*m1*m1
459
460       call mxm(Dt,m1,ur,m1,u(0,0,0),m2)
461       do k=0,N
462          call mxm(us(0,0,k),m1,D ,m1,w(0,0,k),m1)
463       enddo
464       call add2(u(0,0,0),w,m3)
465       call mxm(ut,m2,D ,m1,w,m1)
466       call add2(u(0,0,0),w,m3)
467
468      return
469      end
470C-----------------------------------------------------------------------
471      subroutine geodatq(gf,bmq,w3mq,nxq)
472C     Routine to generate elemental geometric matrices on mesh 1
473C     (Gauss-Legendre Lobatto mesh).
474      include 'SIZE'
475      include 'TOTAL'
476
477      parameter (lg=3+3*(ldim-2),lzq=lx1+1,lxyd=lzq**ldim)
478
479      real*8 gf(lg,nxq**ldim,lelt),bmq(nxq**ldim,lelt),w3mq(nxq,nxq,nxq)
480
481      common /ctmp1/ xr(lxyd),xs(lxyd),xt(lxyd)
482      common /sxrns/ yr(lxyd),ys(lxyd),yt(lxyd)
483     $ ,             zr(lxyd),zs(lxyd),zt(lxyd)
484
485      common /ctmp77/ wd(lxd),zd(lxd)
486      common /dxmfine/ dxmq(lzq,lzq),dxtmq(lzq,lzq)
487
488      integer e
489      real*8 tmp(lxyd)
490      real*8 a11,a12,a13,a21,a22,a23,a31,a32,a33
491      real*8 g11,g12,g13,g21,g22,g23,g31,g32,g33
492      real*8 jacmq
493
494      if (nxq.gt.lzq) call exitti('ABORT: recompile with lzq=$',nxq)
495
496      call zwgl    (zd,wd,lzq)                            ! nxq = number of points
497      call gen_dgl (dxmq,dxtmq,lzq,lzq,tmp)
498
499      do k=1,nxq
500      do j=1,nxq
501      do i=1,nxq
502         w3mq(i,j,k) = wd(i)*wd(j)*wd(k)
503      enddo
504      enddo
505      enddo
506
507      nxyzq = nxq**ldim
508      nxqm1 = lzq-1
509
510      do e=1,nelt
511         call intp_rstd (tmp,xm1(1,1,1,e),lx1,lzq,if3d,0) ! 0-->Fwd interpolation
512         call loc_grad3 (xr,xs,xt,tmp,nxqm1,dxmq,dxtmq)
513
514         call intp_rstd (tmp,ym1(1,1,1,e),lx1,lzq,if3d,0)
515         call loc_grad3 (yr,ys,yt,tmp,nxqm1,dxmq,dxtmq)
516
517         call intp_rstd (tmp,zm1(1,1,1,e),lx1,lzq,if3d,0)
518         call loc_grad3 (zr,zs,zt,tmp,nxqm1,dxmq,dxtmq)
519
520         do i=1,nxyzq
521            a11 = xr(i)
522            a12 = xs(i)
523            a13 = xt(i)
524
525            a21 = yr(i)
526            a22 = ys(i)
527            a23 = yt(i)
528
529            a31 = zr(i)
530            a32 = zs(i)
531            a33 = zt(i)
532
533            g11 = (a22*a33-a23*a32)
534            g12 = (a13*a32-a33*a12)
535            g13 = (a12*a23-a22*a13)
536
537            g21 = (a23*a31-a21*a33)
538            g22 = (a11*a33-a31*a13)
539            g23 = (a13*a21-a23*a11)
540
541            g31 = (a21*a32-a22*a31)
542            g32 = (a12*a31-a32*a11)
543            g33 = (a11*a22-a21*a12)
544
545            jacmq = a11*g11+a21*g12+a31*g13
546
547            bmq(i,e)  = w3mq(i,1,1)*jacmq
548            scale     = w3mq(i,1,1)/jacmq
549
550            gf(1,i,e) = scale*(g11*g11+g12*g12+g13*g13) ! Grr
551            gf(2,i,e) = scale*(g11*g21+g12*g22+g13*g23) ! Grs
552            gf(3,i,e) = scale*(g11*g31+g12*g32+g13*g33) ! Grt
553            gf(4,i,e) = scale*(g21*g21+g22*g22+g23*g23) ! Gss
554            gf(5,i,e) = scale*(g21*g31+g22*g32+g23*g33) ! Gst
555            gf(6,i,e) = scale*(g31*g31+g32*g32+g33*g33) ! Gtt
556
557         enddo
558      enddo
559
560      return
561      end
562C-----------------------------------------------------------------------
563      subroutine setprecn_bp1 (d,h1,h2)
564C     Generate diagonal preconditioner for Helmholtz operator
565C     Input: h1,h2                  Output: d
566      include 'SIZE'
567      include 'TOTAL'
568
569      parameter (lxyz=lx1*ly1*lz1,lg=3+3*(ldim-2))
570
571      real*8    d(lx1,ly1,lz1,lelt),h1(lxyz,lelt),h2(lxyz,lelt)
572      integer e
573
574      real*8         gf(lg,lx1,ly1,lz1,lelt) ! Equivalence new gf() data
575      equivalence (gf,g1m1)                  ! layout to g1m1...g6m1
576
577      real*8 ysm1(ly1)
578
579      nel   = nelfld(ifield)
580      n     = nel*lx1*ly1*lz1
581      nxyz  = lx1*ly1*lz1
582
583      call copy    (d,bm1,n)   ! Mass matrix preconditioning full mass matrix
584      call dssum   (d,nx1,ny1,nz1)
585      call invcol1 (d,n)
586      return
587
588      call dsset(lx1,ly1,lz1)
589
590      do 1000 e=1,nel
591
592        call rzero(d(1,1,1,e),nxyz)
593
594        do 320 iz=1,lz1
595         do 320 iy=1,ly1
596         do 320 ix=1,lx1
597         do 320 iq=1,lx1
598           d(ix,iy,iz,e) = d(ix,iy,iz,e)
599     $                   + gf(1,iq,iy,iz,e) * dxm1(iq,ix)**2
600     $                   + gf(2,ix,iq,iz,e) * dxm1(iq,iy)**2
601     $                   + gf(3,ix,iy,iq,e) * dxm1(iq,iz)**2
602  320    continue
603C
604C        Add cross terms if element is deformed.
605C
606         if (lxyz.gt.0) then
607
608           do i2=1,ly1,ly1-1
609           do i1=1,lx1,lx1-1
610              d(1,i1,i2,e) = d(1,i1,i2,e)
611     $            + gf(4,1,i1,i2,e) * dxtm1(1,1)*dytm1(i1,i1)
612     $            + gf(5,1,i1,i2,e) * dxtm1(1,1)*dztm1(i2,i2)
613              d(lx1,i1,i2,e) = d(lx1,i1,i2,e)
614     $            + gf(4,lx1,i1,i2,e) * dxtm1(lx1,lx1)*dytm1(i1,i1)
615     $            + gf(5,lx1,i1,i2,e) * dxtm1(lx1,lx1)*dztm1(i2,i2)
616              d(i1,1,i2,e) = d(i1,1,i2,e)
617     $            + gf(4,i1,1,i2,e) * dytm1(1,1)*dxtm1(i1,i1)
618     $            + gf(6,i1,1,i2,e) * dytm1(1,1)*dztm1(i2,i2)
619              d(i1,ly1,i2,e) = d(i1,ly1,i2,e)
620     $            + gf(4,i1,ly1,i2,e) * dytm1(ly1,ly1)*dxtm1(i1,i1)
621     $            + gf(6,i1,ly1,i2,e) * dytm1(ly1,ly1)*dztm1(i2,i2)
622              d(i1,i2,1,e) = d(i1,i2,1,e)
623     $            + gf(5,i1,i2,1,e) * dztm1(1,1)*dxtm1(i1,i1)
624     $            + gf(6,i1,i2,1,e) * dztm1(1,1)*dytm1(i2,i2)
625              d(i1,i2,lz1,e) = d(i1,i2,lz1,e)
626     $            + gf(5,i1,i2,lz1,e) * dztm1(lz1,lz1)*dxtm1(i1,i1)
627     $            + gf(6,i1,i2,lz1,e) * dztm1(lz1,lz1)*dytm1(i2,i2)
628
629           enddo
630           enddo
631         endif
632
633        do i=1,lxyz
634           d(i,1,1,e)=d(i,1,1,e)*h1(i,e)+h2(i,e)*bm1(i,1,1,e)
635        enddo
636
637 1000 continue ! element loop
638
639C     If axisymmetric, add a diagonal term in the radial direction (ISD=2)
640
641      if (ifaxis.and.(isd.eq.2)) then
642         do 1200 e=1,nel
643            if (ifrzer(e)) call mxm(ym1(1,1,1,e),lx1,datm1,ly1,ysm1,1)
644            k=0
645            do 1190 j=1,ly1
646            do 1190 i=1,lx1
647               k=k+1
648               if (ym1(i,j,1,e).ne.0.) then
649                  term1 = bm1(i,j,1,e)/ym1(i,j,1,e)**2
650                  if (ifrzer(e)) then
651                     term2 =  wxm1(i)*wam1(1)*dam1(1,j)
652     $                       *jacm1(i,1,1,e)/ysm1(i)
653                  else
654                     term2 = 0.
655                  endif
656                  d(i,j,1,e) = d(i,j,1,e) + h1(k,e)*(term1+term2)
657               endif
658 1190       continue
659 1200    continue
660
661      endif
662      call dssum   (d,nx1,ny1,nz1)
663      call invcol1 (d,n)
664
665      if (nio.eq.0) write(6,1) n,d(1,1,1,1),h1(1,1),h2(1,1),bm1(1,1,1,1)
666   1  format(i9,1p4e12.4,' diag prec')
667
668      return
669      end
670C-----------------------------------------------------------------------
671      subroutine setprecn_bp3 (d,h1,h2)
672C     Generate dummy diagonal preconditioner for Helmholtz operator
673C     Input: h1,h2                  Output: d
674      include 'SIZE'
675      include 'TOTAL'
676
677      parameter (n=lx1*ly1*lz1*lelt)
678      real*8 d(n),h1(n),h2(n)
679
680      call rone (d,n)
681
682      return
683      end
684C-----------------------------------------------------------------------
685      subroutine userchk
686      include 'SIZE'
687      include 'TOTAL'
688
689      integer bp
690
691      call get_bp(bp)
692
693      if (bp==1) then
694        if (istep.gt.0) call bp1
695      elseif (bp==3) then
696        if (istep.gt.0) call bp3
697      else
698        write(6,*) "INVALID BP SPECIFICED"
699      endif
700
701      return
702      end
703C-----------------------------------------------------------------------
704      subroutine bp1
705C     Solution to BP1 using libCEED
706      include 'SIZE'
707      include 'TOTAL'
708      include 'CTIMER'  ! ifsync
709      include 'FDMH1'
710      include 'ceedf.h'
711
712      parameter (lzq=lx1+1)
713      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2),lq=lzq**ldim)
714      common /bpgfactors/ gf(lg*lq,lelt),bmq(lq,lelt),w3mq(lq)
715
716      parameter (lt=lx1*ly1*lz1*lelt)
717      parameter (ld=lxd*lyd*lzd*lelt)
718      common /vcrns/ u1(lt),r1(lt),r2(lt),r3(lt)
719      common /vcrny/ e1(lt)
720      common /vcrvh/ h1(lt),h2(lx*lelt),pap(3)
721      real*8 coords(ldim*lx*lelt)
722
723      logical ifh3
724      integer*8 nnode
725      integer ceed,err,test
726      character*64 spec
727
728      integer p,q,ncompx,ncompu,enode,lnode
729      integer vec_p1,vec_ap1,vec_qdata,vec_coords,vec_rhs
730      integer stridesu(3),stridesx(3),stridesw(3)
731      integer erstrctu,erstrctx,erstrctw
732      integer basisu,basisx
733      integer qf_mass,qf_setup
734      integer op_mass,op_setup
735      real*8  x,y,z
736      integer*8 offset
737
738      external massf,masssetupf
739
740      ifield = 1
741      nxq    = nx1+1
742      n = nx1*ny1*nz1*nelt
743
744      ifsync = .false.
745
746C     Set up coordinates
747      ii=0
748      do j=0,nelt-1
749      do i=1,lx
750        ii=ii+1
751        x = xm1(ii,1,1,1)
752        y = ym1(ii,1,1,1)
753        z = zm1(ii,1,1,1)
754        coords(i+0*lx+3*j*lx)=x
755        coords(i+1*lx+3*j*lx)=y
756        coords(i+2*lx+3*j*lx)=z
757      enddo
758      enddo
759
760C     Init ceed library
761      call get_spec(spec)
762      call ceedinit(trim(spec)//char(0),ceed,err)
763
764      call get_test(test)
765
766C     Set up Nek geometry data
767      call geodatq       (gf,bmq,w3mq,nxq)         ! Set up gf() arrays
768      call set_h2_as_rhoJac_GL(h2,bmq,nxq)
769
770C     Set up true soln
771      call dist_fld_h1   (e1)
772      call copy          (h1,e1,n)                 ! Save exact soln in h1
773
774C     Set up solver parameters
775      tol       = 1e-10
776      param(22) = tol
777      maxit     = 100
778
779      call nekgsync()
780
781C     Create ceed basis for mesh and computation
782      p=nx1
783      q=p+1
784      ncompu=1
785      ncompx=ldim
786      call ceedbasiscreatetensorh1lagrange(ceed,ndim,ndim,p,q,
787     $  ceed_gauss,basisx,err)
788      call ceedbasiscreatetensorh1lagrange(ceed,ndim,ncompu,p,q,
789     $  ceed_gauss,basisu,err)
790
791C     Create ceed element restrictions for mesh and computation
792      enode=p**ldim
793      lnode=enode*nelt*ncompu
794      stridesx=[1,enode,enode*ldim]
795      call ceedelemrestrictioncreatestrided(ceed,nelt,enode,lnode,
796     $  ldim,stridesx,erstrctx,err)
797      stridesu=[1,enode,enode*ncompu]
798      call ceedelemrestrictioncreatestrided(ceed,nelt,enode,lnode,
799     $  ncompu,stridesu,erstrctu,err)
800      call ceedelemrestrictioncreatestrided(ceed,nelt,q**ldim,
801     $  nelt*q**ldim,1,ceed_strides_backend,erstrctw,err)
802
803C     Create ceed vectors
804      call ceedvectorcreate(ceed,lnode,vec_p1,err)
805      call ceedvectorcreate(ceed,lnode,vec_ap1,err)
806      call ceedvectorcreate(ceed,lnode,vec_rhs,err)
807      call ceedvectorcreate(ceed,ldim*lx*nelt,vec_coords,err)
808      call ceedvectorcreate(ceed,nelt*q**ldim,vec_qdata,err)
809
810      offset=0
811      call ceedvectorsetarray(vec_coords,ceed_mem_host,
812     $  ceed_use_pointer,coords,offset,err)
813
814C     Create ceed qfunctions for masssetupf and massf
815      call ceedqfunctioncreateinterior(ceed,1,masssetupf,
816     $  EXAMPLE_DIR
817     $  //'bps/bps.h:masssetupf',qf_setup,err)
818      call ceedqfunctionaddinput(qf_setup,'x',ncompx,
819     $  ceed_eval_interp,err)
820      call ceedqfunctionaddinput(qf_setup,'dx',ncompx*ldim,
821     $  ceed_eval_grad,err)
822      call ceedqfunctionaddinput(qf_setup,'weight',ncompu,
823     $  ceed_eval_weight,err)
824      call ceedqfunctionaddoutput(qf_setup,'qdata',ncompu,
825     $  ceed_eval_none,err)
826      call ceedqfunctionaddoutput(qf_setup,'rhs',ncompu,
827     $  ceed_eval_interp,err)
828
829      call ceedqfunctioncreateinterior(ceed,1,massf,
830     $  EXAMPLE_DIR
831     $  //'bps/bps.h:massf',qf_mass,err)
832      call ceedqfunctionaddinput(qf_mass,'u',ncompu,
833     $  ceed_eval_interp,err)
834      call ceedqfunctionaddinput(qf_mass,'qdata',ncompu,
835     $  ceed_eval_none,err)
836      call ceedqfunctionaddoutput(qf_mass,'v',ncompu,
837     $  ceed_eval_interp,err)
838
839C     Create ceed operators
840      call ceedoperatorcreate(ceed,qf_setup,
841     $  ceed_qfunction_none,ceed_qfunction_none,op_setup,err)
842      call ceedoperatorsetfield(op_setup,'x',erstrctx,
843     $  basisx,ceed_vector_active,err)
844      call ceedoperatorsetfield(op_setup,'dx',erstrctx,
845     $  basisx,ceed_vector_active,err)
846      call ceedoperatorsetfield(op_setup,'weight',
847     $  ceed_elemrestriction_none,basisx,ceed_vector_none,err)
848      call ceedoperatorsetfield(op_setup,'qdata',erstrctw,
849     $  ceed_basis_collocated,ceed_vector_active,err)
850      call ceedoperatorsetfield(op_setup,'rhs',erstrctu,
851     $  basisu,vec_rhs,err)
852
853      call ceedoperatorcreate(ceed,qf_mass,
854     $  ceed_qfunction_none,ceed_qfunction_none,op_mass,err)
855      call ceedoperatorsetfield(op_mass,'u',erstrctu,
856     $  basisu,ceed_vector_active,err)
857      call ceedoperatorsetfield(op_mass,'qdata',erstrctw,
858     $  ceed_basis_collocated,vec_qdata,err)
859      call ceedoperatorsetfield(op_mass,'v',erstrctu,
860     $  basisu,ceed_vector_active,err)
861
862C     Compute setup data
863      call ceedvectorsetarray(vec_rhs,ceed_mem_host,
864     $  ceed_use_pointer,r1,offset,err)
865      call ceedoperatorapply(op_setup,vec_coords,vec_qdata,
866     $  ceed_request_immediate,err)
867
868C     Set up true RHS
869      call dssum         (r1,nx1,ny1,nz1)          ! r1
870
871C     Set up algebraic RHS with libCEED
872      call ceedvectorsetarray(vec_p1,ceed_mem_host,
873     $  ceed_use_pointer,h1,offset,err)
874      call ceedvectorsetarray(vec_ap1,ceed_mem_host,
875     $  ceed_use_pointer,r2,offset,err)
876      call ceedoperatorapply(op_mass,vec_p1,vec_ap1,
877     $  ceed_request_immediate,err)                ! r2 = A_ceed*h1
878      call dssum         (r2,nx1,ny1,nz1)
879
880C     Set up algebraic RHS with Nek5000
881      call axhm1         (pap,r3,h1,h1,h2,'bp1')   ! r3 = A_nek5k*h1
882      call dssum         (r3,nx1,ny1,nz1)
883
884      call nekgsync()
885
886C     Solve true RHS
887      if (nid.eq.0) write (6,*) "libCEED true RHS"
888      tstart = dnekclock()
889      call cggos(u1,r1,h1,h2,vmult,binvm1,tol,ceed,op_mass,
890     $  vec_p1,vec_ap1,maxit,'bp1')
891      tstop  = dnekclock()
892
893C     Output
894      telaps = (tstop-tstart)
895      maxits = maxit
896      er1 = glrdif(u1,e1,n)
897      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit
898
899      if (test.eq.1.and.nid.eq.0) then
900        if (maxit>=100) then
901          write(6,*) "UNCONVERGED CG"
902        endif
903        if (dabs(er1)>5e-3) then
904          write(6,*) "ERROR IS TOO LARGE"
905        endif
906      endif
907
908      nx     = nx1-1
909      nnode  = nelgt            ! nnodes
910      nnode  = nnode*(nx**ldim) ! nnodes
911      nppp   = nnode/np         ! nnodes/proc
912
913      dofps  = nnode/telaps     ! DOF/sec - scalar form
914      titers = telaps/maxits    ! time per iteration
915      tppp_s = titers/nppp      ! time per iteraton per local point
916
917      if (nid.eq.0) write(6,1) 'case scalar:'
918     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s
919
920C     Solve libCEED algebraic RHS
921      if (nid.eq.0) write (6,*) "libCEED algebraic RHS"
922      maxit = 100
923      tstart = dnekclock()
924      call cggos(u1,r2,h1,h2,vmult,binvm1,tol,ceed,op_mass,
925     $  vec_p1,vec_ap1,maxit,'bp1')
926      tstop  = dnekclock()
927
928C     Output
929      telaps = (tstop-tstart)
930      maxits = maxit
931      er1 = glrdif(u1,e1,n)
932      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit
933
934      if (test.eq.1.and.nid.eq.0) then
935        if (maxit>=100) then
936          write(6,*) "UNCONVERGED CG"
937        endif
938        if (dabs(er1)>1e-5) then
939          write(6,*) "ERROR IS TOO LARGE"
940        endif
941      endif
942
943      nx      = nx1-1
944      nnode   = nelgt            ! nnodes
945      nnode   = nnode*(nx**ldim) ! nnodes
946      nppp    = nnode/np         ! nnodes/proc
947
948      dofps   = nnode/telaps     ! DOF/sec - scalar form
949      titers  = telaps/maxits    ! time per iteration
950      tppp_s  = titers/nppp      ! time per iteraton per local point
951
952      if (nid.eq.0) write(6,1) 'case scalar:'
953     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s
954
955C     Solve Nek5000 algebraic RHS
956      if (nid.eq.0) write (6,*) "Nek5000 algebraic RHS"
957      maxit = 100
958      tstart = dnekclock()
959      call cggos(u1,r3,h1,h2,vmult,binvm1,tol,ceed,op_mass,
960     $  vec_p1,vec_ap1,maxit,'bp1')
961      tstop  = dnekclock()
962
963C     Output
964      telaps = (tstop-tstart)
965      maxits = maxit
966      er1 = glrdif(u1,e1,n)
967      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit
968
969      if (test.eq.1.and.nid.eq.0) then
970        if (maxit>=100) then
971          write(6,*) "UNCONVERGED CG"
972        endif
973        if (dabs(er1)>1e-5) then
974          write(6,*) "ERROR IS TOO LARGE"
975        endif
976      endif
977
978      nx      = nx1-1
979      nnode   = nelgt            ! nnodes
980      nnode   = nnode*(nx**ldim) ! nnodes
981      nppp    = nnode/np         ! nnodes/proc
982
983      dofps   = nnode/telaps     ! DOF/sec - scalar form
984      titers  = telaps/maxits    ! time per iteration
985      tppp_s  = titers/nppp      ! time per iteraton per local point
986
987      if (nid.eq.0) write(6,1) 'case scalar:'
988     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s
989
990    1 format(a12,i7,i3,i7,i10,i14,i10,i4,1p4e13.5)
991    3 format(i3,i9,e12.4,1x,a8,i9)
992
993C     Destroy ceed handles
994      call ceedvectordestroy(vec_p1,err)
995      call ceedvectordestroy(vec_ap1,err)
996      call ceedvectordestroy(vec_rhs,err)
997      call ceedvectordestroy(vec_qdata,err)
998      call ceedvectordestroy(vec_coords,err)
999      call ceedelemrestrictiondestroy(erstrctu,err)
1000      call ceedelemrestrictiondestroy(erstrctx,err)
1001      call ceedelemrestrictiondestroy(erstrctw,err)
1002      call ceedbasisdestroy(basisu,err)
1003      call ceedbasisdestroy(basisx,err)
1004      call ceedqfunctiondestroy(qf_setup,err)
1005      call ceedqfunctiondestroy(qf_mass,err)
1006      call ceedoperatordestroy(op_setup,err)
1007      call ceedoperatordestroy(op_mass,err)
1008      call ceeddestroy(ceed,err)
1009
1010      return
1011      end
1012C-----------------------------------------------------------------------
1013      subroutine bp3
1014C     Solution to BP3 using libCEED
1015      include 'SIZE'
1016      include 'TOTAL'
1017      include 'CTIMER'  ! ifsync
1018      include 'FDMH1'
1019      include 'ceedf.h'
1020
1021      parameter (lzq=lx1+1)
1022      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2),lq=lzq**ldim)
1023      common /bpgfactors/ gf(lg*lq,lelt),bmq(lq,lelt),w3mq(lq)
1024
1025      parameter (lt=lx1*ly1*lz1*lelt)
1026      parameter (ld=lxd*lyd*lzd*lelt)
1027      common /vcrns/ u1(lt),r1(lt),r2(lt),r3(lt)
1028      common /vcrny/ e1(lt)
1029      common /vcrvh/ h1(lt),h2(ld),pap(3)
1030      real*8 coords(ldim*lx*lelt)
1031
1032      logical ifh3
1033      integer*8 nnode
1034      integer ceed,err,test
1035      character*64 spec
1036
1037      integer p,q,ncompx,ncompu,enode,lnode
1038      integer vec_p1,vec_ap1,vec_qdata,vec_coords,vec_rhs
1039      integer stridesu(3),stridesx(3),stridesw(3)
1040      integer erstrctu,erstrctx,erstrctw
1041      integer basisu,basisx
1042      integer qf_diffusion,qf_setup
1043      integer op_diffusion,op_setup
1044      integer ii,i,ngeo
1045      real*8  x,y,z
1046      integer*8 offset
1047
1048      external diffusionf,diffsetupf
1049
1050      ifield = 1
1051      nxq    = nx1+1
1052      n = nx1*ny1*nz1*nelt
1053
1054      ifsync = .false.
1055
1056C     Set up coordinates and mask
1057      ii=0
1058      do j=0,nelt-1
1059      do i=1,lx
1060        ii=ii+1
1061        x = xm1(ii,1,1,1)
1062        y = ym1(ii,1,1,1)
1063        z = zm1(ii,1,1,1)
1064        coords(i+0*lx+3*j*lx)=x
1065        coords(i+1*lx+3*j*lx)=y
1066        coords(i+2*lx+3*j*lx)=z
1067        if ( x.eq.0.or.x.eq.1
1068     $   .or.y.eq.0.or.y.eq.1
1069     $   .or.z.eq.0.or.z.eq.1 ) then
1070          h2(ii)=0.
1071        else
1072          h2(ii)=1.
1073        endif
1074      enddo
1075      enddo
1076
1077C     Init ceed library
1078      call get_spec(spec)
1079      call ceedinit(trim(spec)//char(0),ceed,err)
1080
1081      call get_test(test)
1082
1083C     Set up Nek geometry data
1084      call geodatq       (gf,bmq,w3mq,nxq)         ! Set up gf() arrays
1085
1086C     Set up true soln
1087      call sin_fld_h1    (e1)
1088      call xmask1        (e1,h2,nelt)
1089      call copy          (h1,e1,n)                 ! Save exact soln in h1
1090
1091C     Set up solver parameters
1092      tol       = 1e-10
1093      param(22) = tol
1094      maxit     = 100
1095
1096      call nekgsync()
1097
1098C     Create ceed basis for mesh and computation
1099      p=nx1
1100      q=p+1
1101      ncompu=1
1102      ncompx=ldim
1103      call ceedbasiscreatetensorh1lagrange(ceed,ldim,ncompx,p,q,
1104     $  ceed_gauss,basisx,err)
1105      call ceedbasiscreatetensorh1lagrange(ceed,ldim,ncompu,p,q,
1106     $  ceed_gauss,basisu,err)
1107
1108C     Create ceed element restrictions for mesh and computation
1109      enode=p**ldim
1110      lnode=enode*nelt*ncompu
1111      ngeo=(ldim*(ldim+1))/2
1112      stridesx=[1,enode,enode*ldim]
1113      call ceedelemrestrictioncreatestrided(ceed,nelt,enode,lnode,
1114     $  ldim,stridesx,erstrctx,err)
1115      stridesu=[1,enode,enode*ncompu]
1116      call ceedelemrestrictioncreatestrided(ceed,nelt,enode,lnode,
1117     $  ncompu,stridesu,erstrctu,err)
1118      stridesw=[1,q**ldim,ngeo*q**ldim]
1119      call ceedelemrestrictioncreatestrided(ceed,nelt,q**ldim,
1120     $  nelt*q**ldim,ngeo,stridesw,erstrctw,err)
1121
1122C     Create ceed vectors
1123      call ceedvectorcreate(ceed,lnode,vec_p1,err)
1124      call ceedvectorcreate(ceed,lnode,vec_ap1,err)
1125      call ceedvectorcreate(ceed,lnode,vec_rhs,err)
1126      call ceedvectorcreate(ceed,ldim*lx*nelt,vec_coords,err)
1127      call ceedvectorcreate(ceed,nelt*ngeo*q**ldim,vec_qdata,err)
1128
1129      offset=0
1130      call ceedvectorsetarray(vec_coords,ceed_mem_host,
1131     $  ceed_use_pointer,coords,offset,err)
1132
1133C     Create ceed qfunctions for diffsetupf and diffusionf
1134      call ceedqfunctioncreateinterior(ceed,1,diffsetupf,
1135     $  EXAMPLE_DIR
1136     $  //'bps/bps.h:diffsetupf'//char(0),qf_setup,err)
1137      call ceedqfunctionaddinput(qf_setup,'x',ncompx,
1138     $  ceed_eval_interp,err)
1139      call ceedqfunctionaddinput(qf_setup,'dx',ncompx*ldim,
1140     $  ceed_eval_grad,err)
1141      call ceedqfunctionaddinput(qf_setup,'weight',ncompu,
1142     $  ceed_eval_weight,err)
1143      call ceedqfunctionaddoutput(qf_setup,'qdata',ngeo,
1144     $  ceed_eval_none,err)
1145      call ceedqfunctionaddoutput(qf_setup,'rhs',ncompu,
1146     $  ceed_eval_interp,err)
1147
1148      call ceedqfunctioncreateinterior(ceed,1,diffusionf,
1149     $  EXAMPLE_DIR
1150     $  //'bps/bps.h:diffusionf'//char(0),qf_diffusion,err)
1151      call ceedqfunctionaddinput(qf_diffusion,'u',ncompu*ldim,
1152     $  ceed_eval_grad,err)
1153      call ceedqfunctionaddinput(qf_diffusion,'qdata',ngeo,
1154     $  ceed_eval_none,err)
1155      call ceedqfunctionaddoutput(qf_diffusion,'v',ncompu*ldim,
1156     $  ceed_eval_grad,err)
1157
1158C     Create ceed operators
1159      call ceedoperatorcreate(ceed,qf_setup,
1160     $  ceed_qfunction_none,ceed_qfunction_none,op_setup,err)
1161      call ceedoperatorsetfield(op_setup,'x',erstrctx,
1162     $  basisx,ceed_vector_active,err)
1163      call ceedoperatorsetfield(op_setup,'dx',erstrctx,
1164     $  basisx,ceed_vector_active,err)
1165      call ceedoperatorsetfield(op_setup,'weight',
1166     $  ceed_elemrestriction_none,basisx,ceed_vector_none,err)
1167      call ceedoperatorsetfield(op_setup,'qdata',erstrctw,
1168     $  ceed_basis_collocated,ceed_vector_active,err)
1169      call ceedoperatorsetfield(op_setup,'rhs',erstrctu,
1170     $  basisu,vec_rhs,err)
1171
1172      call ceedoperatorcreate(ceed,qf_diffusion,
1173     $  ceed_qfunction_none,ceed_qfunction_none,op_diffusion,err)
1174      call ceedoperatorsetfield(op_diffusion,'u',erstrctu,
1175     $  basisu,ceed_vector_active,err)
1176      call ceedoperatorsetfield(op_diffusion,'qdata',erstrctw,
1177     $  ceed_basis_collocated,vec_qdata,err)
1178      call ceedoperatorsetfield(op_diffusion,'v',erstrctu,
1179     $  basisu,ceed_vector_active,err)
1180
1181C     Compute setup data
1182      call ceedvectorsetarray(vec_rhs,ceed_mem_host,
1183     $  ceed_use_pointer,r1,offset,err)
1184      call ceedoperatorapply(op_setup,vec_coords,vec_qdata,
1185     $  ceed_request_immediate,err)
1186
1187C     Set up true RHS
1188      call dssum         (r1,nx1,ny1,nz1)          ! r1
1189      call xmask1        (r1,h2,nelt)
1190
1191C     Set up algebraic RHS with libCEED
1192      call ceedvectorsetarray(vec_p1,ceed_mem_host,
1193     $  ceed_use_pointer,h1,offset,err)
1194      call ceedvectorsetarray(vec_ap1,ceed_mem_host,
1195     $  ceed_use_pointer,r2,offset,err)
1196      call ceedoperatorapply(op_diffusion,vec_p1,vec_ap1,
1197     $  ceed_request_immediate,err)                ! r2 = A_ceed*h1
1198      call dssum         (r2,nx1,ny1,nz1)
1199      call xmask1        (r2,h2,nelt)
1200
1201C     Set up algebraic RHS with Nek5000
1202      call axhm1         (pap,r3,h1,h1,h2,'bp3')   ! r3 = A_nek5k*h1
1203      call dssum         (r3,nx1,ny1,nz1)
1204      call xmask1        (r3,h2,nelt)
1205
1206      call nekgsync()
1207
1208C     Solve true RHS
1209      if (nid.eq.0) write (6,*) "libCEED true RHS"
1210      tstart = dnekclock()
1211      call cggos(u1,r1,h1,h2,vmult,binvm1,tol,ceed,op_diffusion,
1212     $  vec_p1,vec_ap1,maxit,'bp3')
1213      tstop  = dnekclock()
1214
1215C     Output
1216      telaps = (tstop-tstart)
1217      maxits = maxit
1218      er1 = glrdif(u1,e1,n)
1219      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit
1220
1221      if (test.eq.1.and.nid.eq.0) then
1222        if (maxit>=100) then
1223          write(6,*) "UNCONVERGED CG"
1224        endif
1225        if (dabs(er1)>1e-3) then
1226          write(6,*) "ERROR IS TOO LARGE"
1227        endif
1228      endif
1229
1230      nx      = nx1-1
1231      nnode   = nelgt            ! nnodes
1232      nnode   = nnode*(nx**ldim) ! nnodes
1233      nppp    = nnode/np         ! nnodes/proc
1234
1235      dofps   = nnode/telaps     ! DOF/sec - scalar form
1236      titers  = telaps/maxits    ! time per iteration
1237      tppp_s  = titers/nppp      ! time per iteraton per local point
1238
1239      if (nid.eq.0) write(6,1) 'case scalar:'
1240     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s
1241
1242C     Solve libCEED algebraic RHS
1243      if (nid.eq.0) write (6,*) "libCEED algebraic RHS"
1244      maxit = 100
1245      tstart = dnekclock()
1246      call cggos(u1,r2,h1,h2,vmult,binvm1,tol,ceed,op_diffusion,
1247     $  vec_p1,vec_ap1,maxit,'bp3')
1248      tstop  = dnekclock()
1249
1250C     Output
1251      telaps = (tstop-tstart)
1252      maxits = maxit
1253      er1 = glrdif(u1,e1,n)
1254      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit
1255
1256      if (test.eq.1.and.nid.eq.0) then
1257        if (maxit>=100) then
1258          write(6,*) "UNCONVERGED CG"
1259        endif
1260        if (dabs(er1)>1e-9) then
1261          write(6,*) "ERROR IS TOO LARGE"
1262        endif
1263      endif
1264
1265      nx      = nx1-1
1266      nnode   = nelgt            ! nnodes
1267      nnode   = nnode*(nx**ldim) ! nnodes
1268      nppp    = nnode/np         ! nnodes/proc
1269
1270      dofps   = nnode/telaps     ! DOF/sec - scalar form
1271      titers  = telaps/maxits    ! time per iteration
1272      tppp_s  = titers/nppp      ! time per iteraton per local point
1273
1274      if (nid.eq.0) write(6,1) 'case scalar:'
1275     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s
1276
1277C     Solve Nek5000 algebraic RHS
1278      if (nid.eq.0) write (6,*) "Nek5000 algebraic RHS"
1279      maxit = 100
1280      tstart = dnekclock()
1281      call cggos(u1,r3,h1,h2,vmult,binvm1,tol,ceed,op_diffusion,
1282     $  vec_p1,vec_ap1,maxit,'bp3')
1283      tstop  = dnekclock()
1284
1285C     Output
1286      telaps = (tstop-tstart)
1287      maxits = maxit
1288      er1 = glrdif(u1,e1,n)
1289      if (nid.eq.0) write(6,3) lx1,nelgv,er1,' error ',maxit
1290
1291      if (test.eq.1.and.nid.eq.0) then
1292        if (maxit>=100) then
1293          write(6,*) "UNCONVERGED CG"
1294        endif
1295        if (dabs(er1)>1e-9) then
1296          write(6,*) "ERROR IS TOO LARGE"
1297        endif
1298      endif
1299
1300      nx      = nx1-1
1301      nnode   = nelgt            ! nnodes
1302      nnode   = nnode*(nx**ldim) ! nnodes
1303      nppp    = nnode/np         ! nnodes/proc
1304
1305      dofps   = nnode/telaps     ! DOF/sec - scalar form
1306      titers  = telaps/maxits    ! time per iteration
1307      tppp_s  = titers/nppp      ! time per iteraton per local point
1308
1309      if (nid.eq.0) write(6,1) 'case scalar:'
1310     $ ,np,nx,nelt,nelgt,nnode,nppp,maxits,telaps,dofps,titers,tppp_s
1311
1312    1 format(a12,i7,i3,i7,i10,i14,i10,i4,1p4e13.5)
1313    3 format(i3,i9,e12.4,1x,a8,i9)
1314
1315C     Destroy ceed handles
1316      call ceedvectordestroy(vec_p1,err)
1317      call ceedvectordestroy(vec_ap1,err)
1318      call ceedvectordestroy(vec_rhs,err)
1319      call ceedvectordestroy(vec_qdata,err)
1320      call ceedvectordestroy(vec_coords,err)
1321      call ceedelemrestrictiondestroy(erstrctu,err)
1322      call ceedelemrestrictiondestroy(erstrctx,err)
1323      call ceedelemrestrictiondestroy(erstrctw,err)
1324      call ceedbasisdestroy(basisu,err)
1325      call ceedbasisdestroy(basisx,err)
1326      call ceedqfunctiondestroy(qf_setup,err)
1327      call ceedqfunctiondestroy(qf_diffusion,err)
1328      call ceedoperatordestroy(op_setup,err)
1329      call ceedoperatordestroy(op_diffusion,err)
1330      call ceeddestroy(ceed,err)
1331
1332      return
1333      end
1334C-----------------------------------------------------------------------
1335      subroutine cggos(u1,r1,h1,h2,rmult,binv,tin,ceed,ceed_op,vec_p1,
1336     $  vec_ap1,maxit,bpname)
1337C     Scalar conjugate gradient iteration for solution of uncoupled
1338C     Helmholtz equations
1339C     Input: r1,h1,h2,rmult,binv,tin,ceed,ceed_op,vec_p1,vec_ap1,bpname
1340C     Output: u1,maxit
1341      include 'SIZE'
1342      include 'TOTAL'
1343      include 'DOMAIN'
1344      include 'FDMH1'
1345      character*3 bpname
1346
1347C     INPUT:  rhs1 - rhs
1348C             h1   - exact solution
1349
1350      parameter (lt=lx1*ly1*lz1*lelt)
1351      parameter (ld=lxd*lyd*lzd*lelt)
1352      real*8 u1(lt),r1(lt),h1(lt),h2(lt)
1353      real*8 rmult(1),binv(1)
1354      integer ceed,ceed_op,vec_ap1,vec_p1
1355      common /scrcg/ dpc(lt),p1(lt),z1(lt)
1356      common /scrca/ wv(4),wk(4),rpp1(4),rpp2(4),alph(4),beta(4),pap(4)
1357
1358      real*8 ap1(lt)
1359      equivalence (ap1,z1)
1360
1361      vol   = volfld(ifield)
1362      nel   = nelfld(ifield)
1363      nxyz  = lx1*ly1*lz1
1364      n     = nxyz*nel
1365      nx    = nx1-1                  ! Polynomial order (just for i/o)
1366
1367      tol=tin
1368
1369      if(bpname.ne.'bp1') then
1370        call setprecn_bp3(dpc,h1,h2) ! Set up diagional pre-conidtioner
1371      else
1372        call setprecn_bp1(dpc,h1,h2) ! Set up diagional pre-conidtioner
1373      endif
1374
1375      call rzero         (u1,n)      ! Initialize solution
1376
1377      wv(1)=0
1378      do i=1,n
1379         s=rmult(i)                  !      -1
1380         p1(i)=dpc(i)*r1(i)          ! p = M  r      T
1381         wv(1)=wv(1)+s*p1(i)*r1(i)   !              r p
1382      enddo
1383      call gop(wv(1),wk,'+  ',1)
1384      rpp1(1) = wv  (1)
1385
1386      do 1000 iter=1,maxit
1387         call axhm1_ceed (pap,ap1,p1,h1,h2,ceed,ceed_op,
1388     $     vec_ap1,vec_p1)
1389         call dssum    (ap1,nx1,ny1,nz1)
1390         if (bpname.ne.'bp1') call xmask1(ap1,h2,nel)
1391
1392         call gop      (pap,wk,'+  ',1)
1393         alph(1) = rpp1(1)/pap(1)
1394
1395         do i=1,n
1396            u1(i)=u1(i)+alph(1)* p1(i)
1397            r1(i)=r1(i)-alph(1)*ap1(i)
1398         enddo
1399
1400C        tolerance check here
1401         call rzero(wv,2)
1402         do i=1,n
1403            wv(1)=wv(1)+r1(i)*r1(i)            ! L2 error estimate
1404            z1(i)=dpc(i)*r1(i)                 ! z = M  r
1405            wv(2)=wv(2)+rmult(i)*z1(i)*r1(i)   ! r z
1406         enddo
1407         call gop(wv,wk,'+  ',2)
1408
1409C        if (nio.eq.0) write(6,1) ifield,istep,iter,nx,(wv(k),k=1,1)
1410  1     format(i2,i9,i5,i4,1p1e12.4,' cggos')
1411
1412         enorm=sqrt(wv(1))
1413         if (enorm.lt.tol) then
1414            ifin = iter
1415            if (nio.eq.0) write(6,3000) istep,ifin,enorm,tol
1416            goto 9999
1417         endif
1418C        if (nio.eq.0) write(6,2) iter,enorm,alph(1),pap(1),'alpha'
1419 2      format(i5,1p3e12.4,2x,a5)
1420
1421         rpp2(1)=rpp1(1)
1422         rpp1(1)=wv  (2)
1423         beta1  =rpp1(1)/rpp2(1)
1424         do i=1,n
1425            p1(i)=z1(i) + beta1*p1(i)
1426         enddo
1427
1428 1000 continue
1429
1430      rbnorm=sqrt(wv(1))
1431      if (nio.eq.0) write (6,3001) istep,iter,rbnorm,tol
1432      iter = iter-1
1433
1434 9999 continue
1435
1436      maxit=iter
1437
1438 3000 format(i12,1x,'cggo scalar:',i6,1p5e13.4)
1439 3001 format(2i6,' Unconverged cggo scalar: rbnorm =',1p2e13.6)
1440
1441      return
1442      end
1443C-----------------------------------------------------------------------
1444      subroutine axhm1_ceed(pap,ap1,p1,h1,h2,ceed,ceed_op,
1445     $  vec_ap1,vec_p1)
1446C     Vector conjugate gradient matvec for solution of uncoupled
1447C     Helmholtz equations
1448C     Input: pap,p1,h1,h2,bpname,ceed,ceed_op,vec_ap1,vec_p1
1449C     Output: ap1
1450      include 'SIZE'
1451      include 'TOTAL'
1452      include 'ceedf.h'
1453
1454      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2))
1455      real*8       gf(lg,lx,lelt)             ! Equivalence new gf() data
1456      equivalence (gf,g1m1)                   ! layout to g1m1...g6m1
1457
1458      real*8   pap(3)
1459      real*8   ap1(lx,lelt)
1460      real*8    p1(lx,lelt)
1461      real*8    h1(lx,lelt),h2(lx,lelt)
1462      integer ceed,ceed_op,vec_ap1,vec_p1,err
1463      integer i,e
1464      integer*8 offset
1465
1466      offset=0
1467      call ceedvectorsetarray(vec_p1,ceed_mem_host,ceed_use_pointer,
1468     $  p1,offset,err)
1469      call ceedvectorsetarray(vec_ap1,ceed_mem_host,ceed_use_pointer,
1470     $  ap1,offset,err)
1471
1472      call ceedoperatorapply(ceed_op,vec_p1,vec_ap1,
1473     $  ceed_request_immediate,err)
1474
1475      call ceedvectorsyncarray(vec_ap1,ceed_mem_host,err)
1476
1477      pap(1)=0.
1478
1479      do e=1,nelt
1480         do i=1,lx
1481           pap(1)=pap(1)+ap1(i,e)*p1(i,e)
1482         enddo
1483      enddo
1484
1485      return
1486      end
1487C-----------------------------------------------------------------------
1488      subroutine ax_e_bp1(w,u,g,h1,h2,b,ju,us,ut)
1489C     Local matrix-vector for solution of BP3 (stiffness matrix)
1490C     Input: u,g,h1,h2,b,ju,us,ut   Output: w
1491      include 'SIZE'
1492      include 'TOTAL'
1493
1494      parameter (lxyz=lx1*ly1*lz1,lg=3+3*(ldim-2))
1495      real*8 w(lxyz),u(lxyz),g(lg,lxyz),h1(lxyz),h2(lxyz),b(lxyz)
1496      real*8 ju(lxyz),us(lxyz),ut(lxyz)
1497
1498      nxq = nx1+1 ! Number of quadrature points
1499
1500      lxyzq = nxq**ldim
1501
1502      call intp_rstd (ju,u,lx1,nxq,if3d,0) ! 0 --> Fwd interpolation
1503      do i=1,lxyzq
1504         ju(i)=ju(i)*h2(i) !! h2 must be on the fine grid, w/ quad wts
1505      enddo
1506      call intp_rstd (w,ju,lx1,nxq,if3d,1) ! 1 --> ju-->u
1507
1508      return
1509      end
1510C-----------------------------------------------------------------------
1511      subroutine axhm1_bp1(pap,ap1,p1,h1,h2)
1512C     Vector conjugate gradient matvec for solution of BP1 (mass matrix)
1513C     Input: pap,p1,h1,h2           Output: ap1
1514      include 'SIZE'
1515      include 'TOTAL'
1516
1517      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2))
1518      real*8         gf(lg,lx,lelt)             ! Equivalence new gf() data
1519      equivalence (gf,g1m1)                     ! layout to g1m1...g6m1
1520
1521      real*8 pap(3)
1522      real*8 ap1(lx,lelt)
1523      real*8  p1(lx,lelt)
1524      real*8  h1(lx,lelt),h2(lx,lelt)
1525
1526      real*8 ur(lx),us(lx),ut(lx)
1527      common /ctmp1/ ur,us,ut
1528
1529      integer e
1530
1531      pap(1)=0.
1532
1533      k=1
1534      nxq = nx1+1
1535
1536      do e=1,nelt
1537
1538         call ax_e_bp1(ap1(1,e),p1(1,e),gf(1,1,e),h1(1,e),h2(k,1)
1539     $                                          ,bm1(1,1,1,e),ur,us,ut)
1540         do i=1,lx
1541           pap(1)=pap(1)+ap1(i,e)*p1(i,e)
1542         enddo
1543         k=k+nxq*nxq*nxq
1544
1545      enddo
1546
1547      return
1548      end
1549C-----------------------------------------------------------------------
1550      subroutine ax_e_bp3(w,u,g,ur,us,ut,wk)
1551C     Local matrix-vector for solution of BP3 (stiffness matrix)
1552C     Input: u,g,ur,us,ut,wk        Output: w
1553      include 'SIZE'
1554      include 'TOTAL'
1555
1556      parameter (lzq=lx1+1,lxyz=lx1*lx1*lx1,lxyzq=lzq*lzq*lzq)
1557
1558      common /ctmp0/ tmp(lxyzq)
1559      common /dxmfine/ dxmq(lzq,lzq),dxtmq(lzq,lzq)
1560
1561      real*8 ur(lxyzq),us(lxyzq),ut(lxyzq),wk(lxyzq)
1562      real*8 w(lxyz),u(lxyz),g(2*ldim,lxyzq)
1563
1564      n = lzq-1
1565
1566      call intp_rstd  (wk,u,lx1,lzq,if3d,0) ! 0 --> Fwd interpolation
1567      call loc_grad3  (ur,us,ut,wk,n,dxmq,dxtmq)
1568
1569      do i=1,lxyzq
1570         wr = g(1,i)*ur(i) + g(2,i)*us(i) + g(3,i)*ut(i)
1571         ws = g(2,i)*ur(i) + g(4,i)*us(i) + g(5,i)*ut(i)
1572         wt = g(3,i)*ur(i) + g(5,i)*us(i) + g(6,i)*ut(i)
1573         ur(i) = wr
1574         us(i) = ws
1575         ut(i) = wt
1576      enddo
1577
1578      call loc_grad3t (wk,ur,us,ut,n,dxmq,dxtmq,tmp)
1579      call intp_rstd  (w,wk,lx1,lzq,if3d,1) ! 1 --> ju-->u
1580
1581      return
1582      end
1583C-----------------------------------------------------------------------
1584      subroutine axhm1_bp3(pap,ap1,p1,h1,h2)
1585C     Vector conjugate gradient matvec for solution of BP3 (stiffness matrix)
1586C     Input: pap,p1,h1,h2           Output: ap1
1587      include 'SIZE'
1588      include 'TOTAL'
1589
1590      parameter (lzq=lx1+1)
1591      parameter (lx=lx1*ly1*lz1,lg=3+3*(ldim-2),lq=lzq**ldim)
1592      common /bpgfactors/ gf(lg,lq,lelt),bmq(lq,lelt),w3mq(lq)
1593
1594      real*8 pap(3)
1595      real*8 ap1(lx,lelt)
1596      real*8  p1(lx,lelt)
1597      real*8  h1(lx,lelt),h2(lx,lelt)
1598
1599      common /ctmp1/ ur,us,ut,wk
1600      real*8 ur(lq),us(lq),ut(lq),wk(lq)
1601
1602      integer e
1603
1604      pap(1)=0.
1605
1606      do e=1,nelt
1607
1608         call ax_e_bp3(ap1(1,e),p1(1,e),gf(1,1,e),ur,us,ut,wk)
1609         do i=1,lx
1610           pap(1)=pap(1)+p1(i,e)*ap1(i,e)
1611         enddo
1612
1613      enddo
1614
1615      return
1616      end
1617C-----------------------------------------------------------------------
1618      subroutine axhm1(pap,ap1,p1,h1,h2,bpname)
1619C     Vector conjugate gradient matvec for solution of uncoupled
1620C     Helmholtz equations
1621C     Input: pap,p1,h1,h2,bpname    Output: ap1
1622      include 'SIZE'
1623      include 'TOTAL'
1624
1625      parameter (lx=lx1*ly1*lz1)
1626
1627      real*8 pap(3),ap1(lx,lelt),p1(lx,lelt)
1628      real*8 h1(lx,lelt),h2(lx,lelt)
1629
1630      character*3 bpname
1631
1632      if (bpname.eq.'bp1') then
1633         call axhm1_bp1(pap,ap1,p1,h1,h2)
1634
1635      elseif (bpname.eq.'bp3') then
1636         call axhm1_bp3(pap,ap1,p1,h1,h2)
1637
1638      else
1639         write(6,*) bpname,' axhm1 bpname error'
1640         stop
1641
1642      endif
1643
1644      return
1645      end
1646C-----------------------------------------------------------------------
1647      subroutine get_bp(bp)
1648C     Get BP to run
1649C     Input:                        Output: bp
1650      integer i,bp
1651      character*64 bpval
1652
1653      bp=0
1654      if(iargc().ge.1) then
1655        call getarg(1,bpval)
1656      endif
1657      if(bpval.eq."bp1") then
1658        bp=1
1659      elseif(bpval.eq."bp3") then
1660        bp=3
1661      endif
1662
1663      return
1664      end
1665C-----------------------------------------------------------------------
1666      subroutine get_spec(spec)
1667C     Get CEED backend specification
1668C     Input:                        Output: spec
1669      integer i
1670      character*64 spec
1671
1672      spec = '/cpu/self'
1673      if(iargc().ge.2) then
1674        call getarg(2,spec)
1675      endif
1676
1677      return
1678      end
1679C-----------------------------------------------------------------------
1680      subroutine get_test(test)
1681C     Get test mode flag
1682C     Input:                        Output: test
1683      integer i,test
1684      character*64 testval
1685
1686      test=0
1687      if(iargc().ge.3) then
1688        call getarg(3,testval)
1689      endif
1690      if(testval.eq."test") then
1691        test=1
1692      endif
1693
1694      return
1695      end
1696C-----------------------------------------------------------------------
1697
1698