xref: /petsc/src/snes/tutorials/ex48.c (revision 3307d110e72ee4e6d2468971620073eb5ff93529)
1 static const char help[] = "Toy hydrostatic ice flow with multigrid in 3D.\n\
2 \n\
3 Solves the hydrostatic (aka Blatter/Pattyn/First Order) equations for ice sheet flow\n\
4 using multigrid.  The ice uses a power-law rheology with \"Glen\" exponent 3 (corresponds\n\
5 to p=4/3 in a p-Laplacian).  The focus is on ISMIP-HOM experiments which assume periodic\n\
6 boundary conditions in the x- and y-directions.\n\
7 \n\
8 Equations are rescaled so that the domain size and solution are O(1), details of this scaling\n\
9 can be controlled by the options -units_meter, -units_second, and -units_kilogram.\n\
10 \n\
11 A VTK StructuredGrid output file can be written using the option -o filename.vts\n\
12 \n\n";
13 
14 /*
15 The equations for horizontal velocity (u,v) are
16 
17   - [eta (4 u_x + 2 v_y)]_x - [eta (u_y + v_x)]_y - [eta u_z]_z + rho g s_x = 0
18   - [eta (4 v_y + 2 u_x)]_y - [eta (u_y + v_x)]_x - [eta v_z]_z + rho g s_y = 0
19 
20 where
21 
22   eta = B/2 (epsilon + gamma)^((p-2)/2)
23 
24 is the nonlinear effective viscosity with regularization epsilon and hardness parameter B,
25 written in terms of the second invariant
26 
27   gamma = u_x^2 + v_y^2 + u_x v_y + (1/4) (u_y + v_x)^2 + (1/4) u_z^2 + (1/4) v_z^2
28 
29 The surface boundary conditions are the natural conditions.  The basal boundary conditions
30 are either no-slip, or Navier (linear) slip with spatially variant friction coefficient beta^2.
31 
32 In the code, the equations for (u,v) are multiplied through by 1/(rho g) so that residuals are O(1).
33 
34 The discretization is Q1 finite elements, managed by a DMDA.  The grid is never distorted in the
35 map (x,y) plane, but the bed and surface may be bumpy.  This is handled as usual in FEM, through
36 the Jacobian of the coordinate transformation from a reference element to the physical element.
37 
38 Since ice-flow is tightly coupled in the z-direction (within columns), the DMDA is managed
39 specially so that columns are never distributed, and are always contiguous in memory.
40 This amounts to reversing the meaning of X,Y,Z compared to the DMDA's internal interpretation,
41 and then indexing as vec[i][j][k].  The exotic coarse spaces require 2D DMDAs which are made to
42 use compatible domain decomposition relative to the 3D DMDAs.
43 
44 There are two compile-time options:
45 
46   NO_SSE2:
47     If the host supports SSE2, we use integration code that has been vectorized with SSE2
48     intrinsics, unless this macro is defined.  The intrinsics speed up integration by about
49     30% on my architecture (P8700, gcc-4.5 snapshot).
50 
51   COMPUTE_LOWER_TRIANGULAR:
52     The element matrices we assemble are lower-triangular so it is not necessary to compute
53     all entries explicitly.  If this macro is defined, the lower-triangular entries are
54     computed explicitly.
55 
56 */
57 
58 #if defined(PETSC_APPLE_FRAMEWORK)
59 #import <PETSc/petscsnes.h>
60 #import <PETSc/petsc/private/dmdaimpl.h>     /* There is not yet a public interface to manipulate dm->ops */
61 #else
62 
63 #include <petscsnes.h>
64 #include <petsc/private/dmdaimpl.h>     /* There is not yet a public interface to manipulate dm->ops */
65 #endif
66 #include <ctype.h>              /* toupper() */
67 
68 #if defined(__cplusplus) || defined (PETSC_HAVE_WINDOWS_COMPILERS) || defined (__PGI)
69 /*  c++ cannot handle  [_restrict_] notation like C does */
70 #undef PETSC_RESTRICT
71 #define PETSC_RESTRICT
72 #endif
73 
74 #if defined __SSE2__
75 #  include <emmintrin.h>
76 #endif
77 
78 /* The SSE2 kernels are only for PetscScalar=double on architectures that support it */
79 #if !defined NO_SSE2                           \
80      && !defined PETSC_USE_COMPLEX             \
81      && !defined PETSC_USE_REAL_SINGLE         \
82      && !defined PETSC_USE_REAL___FLOAT128     \
83      && !defined PETSC_USE_REAL___FP16         \
84      && defined __SSE2__
85 #define USE_SSE2_KERNELS 1
86 #else
87 #define USE_SSE2_KERNELS 0
88 #endif
89 
90 static PetscClassId THI_CLASSID;
91 
92 typedef enum {QUAD_GAUSS,QUAD_LOBATTO} QuadratureType;
93 static const char      *QuadratureTypes[] = {"gauss","lobatto","QuadratureType","QUAD_",0};
94 PETSC_UNUSED static const PetscReal HexQWeights[8]     = {1,1,1,1,1,1,1,1};
95 PETSC_UNUSED static const PetscReal HexQNodes[]        = {-0.57735026918962573, 0.57735026918962573};
96 #define G 0.57735026918962573
97 #define H (0.5*(1.+G))
98 #define L (0.5*(1.-G))
99 #define M (-0.5)
100 #define P (0.5)
101 /* Special quadrature: Lobatto in horizontal, Gauss in vertical */
102 static const PetscReal HexQInterp_Lobatto[8][8] = {{H,0,0,0,L,0,0,0},
103                                                    {0,H,0,0,0,L,0,0},
104                                                    {0,0,H,0,0,0,L,0},
105                                                    {0,0,0,H,0,0,0,L},
106                                                    {L,0,0,0,H,0,0,0},
107                                                    {0,L,0,0,0,H,0,0},
108                                                    {0,0,L,0,0,0,H,0},
109                                                    {0,0,0,L,0,0,0,H}};
110 static const PetscReal HexQDeriv_Lobatto[8][8][3] = {
111   {{M*H,M*H,M},{P*H,0,0}  ,{0,0,0}    ,{0,P*H,0}  ,{M*L,M*L,P},{P*L,0,0}  ,{0,0,0}    ,{0,P*L,0}  },
112   {{M*H,0,0}  ,{P*H,M*H,M},{0,P*H,0}  ,{0,0,0}    ,{M*L,0,0}  ,{P*L,M*L,P},{0,P*L,0}  ,{0,0,0}    },
113   {{0,0,0}    ,{0,M*H,0}  ,{P*H,P*H,M},{M*H,0,0}  ,{0,0,0}    ,{0,M*L,0}  ,{P*L,P*L,P},{M*L,0,0}  },
114   {{0,M*H,0}  ,{0,0,0}    ,{P*H,0,0}  ,{M*H,P*H,M},{0,M*L,0}  ,{0,0,0}    ,{P*L,0,0}  ,{M*L,P*L,P}},
115   {{M*L,M*L,M},{P*L,0,0}  ,{0,0,0}    ,{0,P*L,0}  ,{M*H,M*H,P},{P*H,0,0}  ,{0,0,0}    ,{0,P*H,0}  },
116   {{M*L,0,0}  ,{P*L,M*L,M},{0,P*L,0}  ,{0,0,0}    ,{M*H,0,0}  ,{P*H,M*H,P},{0,P*H,0}  ,{0,0,0}    },
117   {{0,0,0}    ,{0,M*L,0}  ,{P*L,P*L,M},{M*L,0,0}  ,{0,0,0}    ,{0,M*H,0}  ,{P*H,P*H,P},{M*H,0,0}  },
118   {{0,M*L,0}  ,{0,0,0}    ,{P*L,0,0}  ,{M*L,P*L,M},{0,M*H,0}  ,{0,0,0}    ,{P*H,0,0}  ,{M*H,P*H,P}}};
119 /* Stanndard Gauss */
120 static const PetscReal HexQInterp_Gauss[8][8] = {{H*H*H,L*H*H,L*L*H,H*L*H, H*H*L,L*H*L,L*L*L,H*L*L},
121                                                  {L*H*H,H*H*H,H*L*H,L*L*H, L*H*L,H*H*L,H*L*L,L*L*L},
122                                                  {L*L*H,H*L*H,H*H*H,L*H*H, L*L*L,H*L*L,H*H*L,L*H*L},
123                                                  {H*L*H,L*L*H,L*H*H,H*H*H, H*L*L,L*L*L,L*H*L,H*H*L},
124                                                  {H*H*L,L*H*L,L*L*L,H*L*L, H*H*H,L*H*H,L*L*H,H*L*H},
125                                                  {L*H*L,H*H*L,H*L*L,L*L*L, L*H*H,H*H*H,H*L*H,L*L*H},
126                                                  {L*L*L,H*L*L,H*H*L,L*H*L, L*L*H,H*L*H,H*H*H,L*H*H},
127                                                  {H*L*L,L*L*L,L*H*L,H*H*L, H*L*H,L*L*H,L*H*H,H*H*H}};
128 static const PetscReal HexQDeriv_Gauss[8][8][3] = {
129   {{M*H*H,H*M*H,H*H*M},{P*H*H,L*M*H,L*H*M},{P*L*H,L*P*H,L*L*M},{M*L*H,H*P*H,H*L*M}, {M*H*L,H*M*L,H*H*P},{P*H*L,L*M*L,L*H*P},{P*L*L,L*P*L,L*L*P},{M*L*L,H*P*L,H*L*P}},
130   {{M*H*H,L*M*H,L*H*M},{P*H*H,H*M*H,H*H*M},{P*L*H,H*P*H,H*L*M},{M*L*H,L*P*H,L*L*M}, {M*H*L,L*M*L,L*H*P},{P*H*L,H*M*L,H*H*P},{P*L*L,H*P*L,H*L*P},{M*L*L,L*P*L,L*L*P}},
131   {{M*L*H,L*M*H,L*L*M},{P*L*H,H*M*H,H*L*M},{P*H*H,H*P*H,H*H*M},{M*H*H,L*P*H,L*H*M}, {M*L*L,L*M*L,L*L*P},{P*L*L,H*M*L,H*L*P},{P*H*L,H*P*L,H*H*P},{M*H*L,L*P*L,L*H*P}},
132   {{M*L*H,H*M*H,H*L*M},{P*L*H,L*M*H,L*L*M},{P*H*H,L*P*H,L*H*M},{M*H*H,H*P*H,H*H*M}, {M*L*L,H*M*L,H*L*P},{P*L*L,L*M*L,L*L*P},{P*H*L,L*P*L,L*H*P},{M*H*L,H*P*L,H*H*P}},
133   {{M*H*L,H*M*L,H*H*M},{P*H*L,L*M*L,L*H*M},{P*L*L,L*P*L,L*L*M},{M*L*L,H*P*L,H*L*M}, {M*H*H,H*M*H,H*H*P},{P*H*H,L*M*H,L*H*P},{P*L*H,L*P*H,L*L*P},{M*L*H,H*P*H,H*L*P}},
134   {{M*H*L,L*M*L,L*H*M},{P*H*L,H*M*L,H*H*M},{P*L*L,H*P*L,H*L*M},{M*L*L,L*P*L,L*L*M}, {M*H*H,L*M*H,L*H*P},{P*H*H,H*M*H,H*H*P},{P*L*H,H*P*H,H*L*P},{M*L*H,L*P*H,L*L*P}},
135   {{M*L*L,L*M*L,L*L*M},{P*L*L,H*M*L,H*L*M},{P*H*L,H*P*L,H*H*M},{M*H*L,L*P*L,L*H*M}, {M*L*H,L*M*H,L*L*P},{P*L*H,H*M*H,H*L*P},{P*H*H,H*P*H,H*H*P},{M*H*H,L*P*H,L*H*P}},
136   {{M*L*L,H*M*L,H*L*M},{P*L*L,L*M*L,L*L*M},{P*H*L,L*P*L,L*H*M},{M*H*L,H*P*L,H*H*M}, {M*L*H,H*M*H,H*L*P},{P*L*H,L*M*H,L*L*P},{P*H*H,L*P*H,L*H*P},{M*H*H,H*P*H,H*H*P}}};
137 static const PetscReal (*HexQInterp)[8],(*HexQDeriv)[8][3];
138 /* Standard 2x2 Gauss quadrature for the bottom layer. */
139 static const PetscReal QuadQInterp[4][4] = {{H*H,L*H,L*L,H*L},
140                                             {L*H,H*H,H*L,L*L},
141                                             {L*L,H*L,H*H,L*H},
142                                             {H*L,L*L,L*H,H*H}};
143 static const PetscReal QuadQDeriv[4][4][2] = {
144   {{M*H,M*H},{P*H,M*L},{P*L,P*L},{M*L,P*H}},
145   {{M*H,M*L},{P*H,M*H},{P*L,P*H},{M*L,P*L}},
146   {{M*L,M*L},{P*L,M*H},{P*H,P*H},{M*H,P*L}},
147   {{M*L,M*H},{P*L,M*L},{P*H,P*L},{M*H,P*H}}};
148 #undef G
149 #undef H
150 #undef L
151 #undef M
152 #undef P
153 
154 #define HexExtract(x,i,j,k,n) do {              \
155     (n)[0] = (x)[i][j][k];                      \
156     (n)[1] = (x)[i+1][j][k];                    \
157     (n)[2] = (x)[i+1][j+1][k];                  \
158     (n)[3] = (x)[i][j+1][k];                    \
159     (n)[4] = (x)[i][j][k+1];                    \
160     (n)[5] = (x)[i+1][j][k+1];                  \
161     (n)[6] = (x)[i+1][j+1][k+1];                \
162     (n)[7] = (x)[i][j+1][k+1];                  \
163   } while (0)
164 
165 #define HexExtractRef(x,i,j,k,n) do {           \
166     (n)[0] = &(x)[i][j][k];                     \
167     (n)[1] = &(x)[i+1][j][k];                   \
168     (n)[2] = &(x)[i+1][j+1][k];                 \
169     (n)[3] = &(x)[i][j+1][k];                   \
170     (n)[4] = &(x)[i][j][k+1];                   \
171     (n)[5] = &(x)[i+1][j][k+1];                 \
172     (n)[6] = &(x)[i+1][j+1][k+1];               \
173     (n)[7] = &(x)[i][j+1][k+1];                 \
174   } while (0)
175 
176 #define QuadExtract(x,i,j,n) do {               \
177     (n)[0] = (x)[i][j];                         \
178     (n)[1] = (x)[i+1][j];                       \
179     (n)[2] = (x)[i+1][j+1];                     \
180     (n)[3] = (x)[i][j+1];                       \
181   } while (0)
182 
183 static void HexGrad(const PetscReal dphi[][3],const PetscReal zn[],PetscReal dz[])
184 {
185   PetscInt i;
186   dz[0] = dz[1] = dz[2] = 0;
187   for (i=0; i<8; i++) {
188     dz[0] += dphi[i][0] * zn[i];
189     dz[1] += dphi[i][1] * zn[i];
190     dz[2] += dphi[i][2] * zn[i];
191   }
192 }
193 
194 static void HexComputeGeometry(PetscInt q,PetscReal hx,PetscReal hy,const PetscReal dz[PETSC_RESTRICT],PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscReal *PETSC_RESTRICT jw)
195 {
196   const PetscReal jac[3][3]  = {{hx/2,0,0}, {0,hy/2,0}, {dz[0],dz[1],dz[2]}};
197   const PetscReal ijac[3][3] = {{1/jac[0][0],0,0}, {0,1/jac[1][1],0}, {-jac[2][0]/(jac[0][0]*jac[2][2]),-jac[2][1]/(jac[1][1]*jac[2][2]),1/jac[2][2]}};
198   const PetscReal jdet       = jac[0][0]*jac[1][1]*jac[2][2];
199   PetscInt        i;
200 
201   for (i=0; i<8; i++) {
202     const PetscReal *dphir = HexQDeriv[q][i];
203     phi[i]     = HexQInterp[q][i];
204     dphi[i][0] = dphir[0]*ijac[0][0] + dphir[1]*ijac[1][0] + dphir[2]*ijac[2][0];
205     dphi[i][1] = dphir[0]*ijac[0][1] + dphir[1]*ijac[1][1] + dphir[2]*ijac[2][1];
206     dphi[i][2] = dphir[0]*ijac[0][2] + dphir[1]*ijac[1][2] + dphir[2]*ijac[2][2];
207   }
208   *jw = 1.0 * jdet;
209 }
210 
211 typedef struct _p_THI   *THI;
212 typedef struct _n_Units *Units;
213 
214 typedef struct {
215   PetscScalar u,v;
216 } Node;
217 
218 typedef struct {
219   PetscScalar b;                /* bed */
220   PetscScalar h;                /* thickness */
221   PetscScalar beta2;            /* friction */
222 } PrmNode;
223 
224 typedef struct {
225   PetscReal min,max,cmin,cmax;
226 } PRange;
227 
228 typedef enum {THIASSEMBLY_TRIDIAGONAL,THIASSEMBLY_FULL} THIAssemblyMode;
229 
230 struct _p_THI {
231   PETSCHEADER(int);
232   void      (*initialize)(THI,PetscReal x,PetscReal y,PrmNode *p);
233   PetscInt  zlevels;
234   PetscReal Lx,Ly,Lz;           /* Model domain */
235   PetscReal alpha;              /* Bed angle */
236   Units     units;
237   PetscReal dirichlet_scale;
238   PetscReal ssa_friction_scale;
239   PRange    eta;
240   PRange    beta2;
241   struct {
242     PetscReal Bd2,eps,exponent;
243   } viscosity;
244   struct {
245     PetscReal irefgam,eps2,exponent,refvel,epsvel;
246   } friction;
247   PetscReal rhog;
248   PetscBool no_slip;
249   PetscBool tridiagonal;
250   PetscBool coarse2d;
251   PetscBool verbose;
252   MatType   mattype;
253 };
254 
255 struct _n_Units {
256   /* fundamental */
257   PetscReal meter;
258   PetscReal kilogram;
259   PetscReal second;
260   /* derived */
261   PetscReal Pascal;
262   PetscReal year;
263 };
264 
265 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo*,Node***,Mat,Mat,THI);
266 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo*,Node***,Mat,Mat,THI);
267 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo*,Node**,Mat,Mat,THI);
268 
269 static void PrmHexGetZ(const PrmNode pn[],PetscInt k,PetscInt zm,PetscReal zn[])
270 {
271   const PetscScalar zm1    = zm-1,
272                     znl[8] = {pn[0].b + pn[0].h*(PetscScalar)k/zm1,
273                               pn[1].b + pn[1].h*(PetscScalar)k/zm1,
274                               pn[2].b + pn[2].h*(PetscScalar)k/zm1,
275                               pn[3].b + pn[3].h*(PetscScalar)k/zm1,
276                               pn[0].b + pn[0].h*(PetscScalar)(k+1)/zm1,
277                               pn[1].b + pn[1].h*(PetscScalar)(k+1)/zm1,
278                               pn[2].b + pn[2].h*(PetscScalar)(k+1)/zm1,
279                               pn[3].b + pn[3].h*(PetscScalar)(k+1)/zm1};
280   PetscInt i;
281   for (i=0; i<8; i++) zn[i] = PetscRealPart(znl[i]);
282 }
283 
284 /* Tests A and C are from the ISMIP-HOM paper (Pattyn et al. 2008) */
285 static void THIInitialize_HOM_A(THI thi,PetscReal x,PetscReal y,PrmNode *p)
286 {
287   Units     units = thi->units;
288   PetscReal s     = -x*PetscSinReal(thi->alpha);
289 
290   p->b     = s - 1000*units->meter + 500*units->meter * PetscSinReal(x*2*PETSC_PI/thi->Lx) * PetscSinReal(y*2*PETSC_PI/thi->Ly);
291   p->h     = s - p->b;
292   p->beta2 = 1e30;
293 }
294 
295 static void THIInitialize_HOM_C(THI thi,PetscReal x,PetscReal y,PrmNode *p)
296 {
297   Units     units = thi->units;
298   PetscReal s     = -x*PetscSinReal(thi->alpha);
299 
300   p->b = s - 1000*units->meter;
301   p->h = s - p->b;
302   /* tau_b = beta2 v   is a stress (Pa) */
303   p->beta2 = 1000 * (1 + PetscSinReal(x*2*PETSC_PI/thi->Lx)*PetscSinReal(y*2*PETSC_PI/thi->Ly)) * units->Pascal * units->year / units->meter;
304 }
305 
306 /* These are just toys */
307 
308 /* Same bed as test A, free slip everywhere except for a discontinuous jump to a circular sticky region in the middle. */
309 static void THIInitialize_HOM_X(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
310 {
311   Units     units = thi->units;
312   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
313   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
314   p->b     = s - 1000*units->meter + 500*units->meter*PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
315   p->h     = s - p->b;
316   p->beta2 = 1000 * (r < 1 ? 2 : 0) * units->Pascal * units->year / units->meter;
317 }
318 
319 /* Like Z, but with 200 meter cliffs */
320 static void THIInitialize_HOM_Y(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
321 {
322   Units     units = thi->units;
323   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
324   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
325 
326   p->b = s - 1000*units->meter + 500*units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
327   if (PetscRealPart(p->b) > -700*units->meter) p->b += 200*units->meter;
328   p->h     = s - p->b;
329   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16*r))/PetscSqrtReal(1e-2 + 16*r)*PetscCosReal(x*3/2)*PetscCosReal(y*3/2)) * units->Pascal * units->year / units->meter;
330 }
331 
332 /* Same bed as A, smoothly varying slipperiness, similar to MATLAB's "sombrero" (uncorrelated with bathymetry) */
333 static void THIInitialize_HOM_Z(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
334 {
335   Units     units = thi->units;
336   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
337   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
338 
339   p->b     = s - 1000*units->meter + 500*units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
340   p->h     = s - p->b;
341   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16*r))/PetscSqrtReal(1e-2 + 16*r)*PetscCosReal(x*3/2)*PetscCosReal(y*3/2)) * units->Pascal * units->year / units->meter;
342 }
343 
344 static void THIFriction(THI thi,PetscReal rbeta2,PetscReal gam,PetscReal *beta2,PetscReal *dbeta2)
345 {
346   if (thi->friction.irefgam == 0) {
347     Units units = thi->units;
348     thi->friction.irefgam = 1./(0.5*PetscSqr(thi->friction.refvel * units->meter / units->year));
349     thi->friction.eps2    = 0.5*PetscSqr(thi->friction.epsvel * units->meter / units->year) * thi->friction.irefgam;
350   }
351   if (thi->friction.exponent == 0) {
352     *beta2  = rbeta2;
353     *dbeta2 = 0;
354   } else {
355     *beta2  = rbeta2 * PetscPowReal(thi->friction.eps2 + gam*thi->friction.irefgam,thi->friction.exponent);
356     *dbeta2 = thi->friction.exponent * *beta2 / (thi->friction.eps2 + gam*thi->friction.irefgam) * thi->friction.irefgam;
357   }
358 }
359 
360 static void THIViscosity(THI thi,PetscReal gam,PetscReal *eta,PetscReal *deta)
361 {
362   PetscReal Bd2,eps,exponent;
363   if (thi->viscosity.Bd2 == 0) {
364     Units units = thi->units;
365     const PetscReal
366       n = 3.,                                           /* Glen exponent */
367       p = 1. + 1./n,                                    /* for Stokes */
368       A = 1.e-16 * PetscPowReal(units->Pascal,-n) / units->year, /* softness parameter (Pa^{-n}/s) */
369       B = PetscPowReal(A,-1./n);                                 /* hardness parameter */
370     thi->viscosity.Bd2      = B/2;
371     thi->viscosity.exponent = (p-2)/2;
372     thi->viscosity.eps      = 0.5*PetscSqr(1e-5 / units->year);
373   }
374   Bd2      = thi->viscosity.Bd2;
375   exponent = thi->viscosity.exponent;
376   eps      = thi->viscosity.eps;
377   *eta     = Bd2 * PetscPowReal(eps + gam,exponent);
378   *deta    = exponent * (*eta) / (eps + gam);
379 }
380 
381 static void RangeUpdate(PetscReal *min,PetscReal *max,PetscReal x)
382 {
383   if (x < *min) *min = x;
384   if (x > *max) *max = x;
385 }
386 
387 static void PRangeClear(PRange *p)
388 {
389   p->cmin = p->min = 1e100;
390   p->cmax = p->max = -1e100;
391 }
392 
393 static PetscErrorCode PRangeMinMax(PRange *p,PetscReal min,PetscReal max)
394 {
395   PetscFunctionBeginUser;
396   p->cmin = min;
397   p->cmax = max;
398   if (min < p->min) p->min = min;
399   if (max > p->max) p->max = max;
400   PetscFunctionReturn(0);
401 }
402 
403 static PetscErrorCode THIDestroy(THI *thi)
404 {
405   PetscFunctionBeginUser;
406   if (!*thi) PetscFunctionReturn(0);
407   if (--((PetscObject)(*thi))->refct > 0) {*thi = 0; PetscFunctionReturn(0);}
408   PetscCall(PetscFree((*thi)->units));
409   PetscCall(PetscFree((*thi)->mattype));
410   PetscCall(PetscHeaderDestroy(thi));
411   PetscFunctionReturn(0);
412 }
413 
414 static PetscErrorCode THICreate(MPI_Comm comm,THI *inthi)
415 {
416   static PetscBool registered = PETSC_FALSE;
417   THI              thi;
418   Units            units;
419 
420   PetscFunctionBeginUser;
421   *inthi = 0;
422   if (!registered) {
423     PetscCall(PetscClassIdRegister("Toy Hydrostatic Ice",&THI_CLASSID));
424     registered = PETSC_TRUE;
425   }
426   PetscCall(PetscHeaderCreate(thi,THI_CLASSID,"THI","Toy Hydrostatic Ice","",comm,THIDestroy,0));
427 
428   PetscCall(PetscNew(&thi->units));
429   units           = thi->units;
430   units->meter    = 1e-2;
431   units->second   = 1e-7;
432   units->kilogram = 1e-12;
433 
434   PetscOptionsBegin(comm,NULL,"Scaled units options","");
435   {
436     PetscCall(PetscOptionsReal("-units_meter","1 meter in scaled length units","",units->meter,&units->meter,NULL));
437     PetscCall(PetscOptionsReal("-units_second","1 second in scaled time units","",units->second,&units->second,NULL));
438     PetscCall(PetscOptionsReal("-units_kilogram","1 kilogram in scaled mass units","",units->kilogram,&units->kilogram,NULL));
439   }
440   PetscOptionsEnd();
441   units->Pascal = units->kilogram / (units->meter * PetscSqr(units->second));
442   units->year   = 31556926. * units->second; /* seconds per year */
443 
444   thi->Lx              = 10.e3;
445   thi->Ly              = 10.e3;
446   thi->Lz              = 1000;
447   thi->dirichlet_scale = 1;
448   thi->verbose         = PETSC_FALSE;
449 
450   PetscOptionsBegin(comm,NULL,"Toy Hydrostatic Ice options","");
451   {
452     QuadratureType quad       = QUAD_GAUSS;
453     char           homexp[]   = "A";
454     char           mtype[256] = MATSBAIJ;
455     PetscReal      L,m = 1.0;
456     PetscBool      flg;
457     L    = thi->Lx;
458     PetscCall(PetscOptionsReal("-thi_L","Domain size (m)","",L,&L,&flg));
459     if (flg) thi->Lx = thi->Ly = L;
460     PetscCall(PetscOptionsReal("-thi_Lx","X Domain size (m)","",thi->Lx,&thi->Lx,NULL));
461     PetscCall(PetscOptionsReal("-thi_Ly","Y Domain size (m)","",thi->Ly,&thi->Ly,NULL));
462     PetscCall(PetscOptionsReal("-thi_Lz","Z Domain size (m)","",thi->Lz,&thi->Lz,NULL));
463     PetscCall(PetscOptionsString("-thi_hom","ISMIP-HOM experiment (A or C)","",homexp,homexp,sizeof(homexp),NULL));
464     switch (homexp[0] = toupper(homexp[0])) {
465     case 'A':
466       thi->initialize = THIInitialize_HOM_A;
467       thi->no_slip    = PETSC_TRUE;
468       thi->alpha      = 0.5;
469       break;
470     case 'C':
471       thi->initialize = THIInitialize_HOM_C;
472       thi->no_slip    = PETSC_FALSE;
473       thi->alpha      = 0.1;
474       break;
475     case 'X':
476       thi->initialize = THIInitialize_HOM_X;
477       thi->no_slip    = PETSC_FALSE;
478       thi->alpha      = 0.3;
479       break;
480     case 'Y':
481       thi->initialize = THIInitialize_HOM_Y;
482       thi->no_slip    = PETSC_FALSE;
483       thi->alpha      = 0.5;
484       break;
485     case 'Z':
486       thi->initialize = THIInitialize_HOM_Z;
487       thi->no_slip    = PETSC_FALSE;
488       thi->alpha      = 0.5;
489       break;
490     default:
491       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"HOM experiment '%c' not implemented",homexp[0]);
492     }
493     PetscCall(PetscOptionsEnum("-thi_quadrature","Quadrature to use for 3D elements","",QuadratureTypes,(PetscEnum)quad,(PetscEnum*)&quad,NULL));
494     switch (quad) {
495     case QUAD_GAUSS:
496       HexQInterp = HexQInterp_Gauss;
497       HexQDeriv  = HexQDeriv_Gauss;
498       break;
499     case QUAD_LOBATTO:
500       HexQInterp = HexQInterp_Lobatto;
501       HexQDeriv  = HexQDeriv_Lobatto;
502       break;
503     }
504     PetscCall(PetscOptionsReal("-thi_alpha","Bed angle (degrees)","",thi->alpha,&thi->alpha,NULL));
505 
506     thi->friction.refvel = 100.;
507     thi->friction.epsvel = 1.;
508 
509     PetscCall(PetscOptionsReal("-thi_friction_refvel","Reference velocity for sliding","",thi->friction.refvel,&thi->friction.refvel,NULL));
510     PetscCall(PetscOptionsReal("-thi_friction_epsvel","Regularization velocity for sliding","",thi->friction.epsvel,&thi->friction.epsvel,NULL));
511     PetscCall(PetscOptionsReal("-thi_friction_m","Friction exponent, 0=Coulomb, 1=Navier","",m,&m,NULL));
512 
513     thi->friction.exponent = (m-1)/2;
514 
515     PetscCall(PetscOptionsReal("-thi_dirichlet_scale","Scale Dirichlet boundary conditions by this factor","",thi->dirichlet_scale,&thi->dirichlet_scale,NULL));
516     PetscCall(PetscOptionsReal("-thi_ssa_friction_scale","Scale slip boundary conditions by this factor in SSA (2D) assembly","",thi->ssa_friction_scale,&thi->ssa_friction_scale,NULL));
517     PetscCall(PetscOptionsBool("-thi_coarse2d","Use a 2D coarse space corresponding to SSA","",thi->coarse2d,&thi->coarse2d,NULL));
518     PetscCall(PetscOptionsBool("-thi_tridiagonal","Assemble a tridiagonal system (column coupling only) on the finest level","",thi->tridiagonal,&thi->tridiagonal,NULL));
519     PetscCall(PetscOptionsFList("-thi_mat_type","Matrix type","MatSetType",MatList,mtype,(char*)mtype,sizeof(mtype),NULL));
520     PetscCall(PetscStrallocpy(mtype,(char**)&thi->mattype));
521     PetscCall(PetscOptionsBool("-thi_verbose","Enable verbose output (like matrix sizes and statistics)","",thi->verbose,&thi->verbose,NULL));
522   }
523   PetscOptionsEnd();
524 
525   /* dimensionalize */
526   thi->Lx    *= units->meter;
527   thi->Ly    *= units->meter;
528   thi->Lz    *= units->meter;
529   thi->alpha *= PETSC_PI / 180;
530 
531   PRangeClear(&thi->eta);
532   PRangeClear(&thi->beta2);
533 
534   {
535     PetscReal u       = 1000*units->meter/(3e7*units->second),
536               gradu   = u / (100*units->meter),eta,deta,
537               rho     = 910 * units->kilogram/PetscPowReal(units->meter,3),
538               grav    = 9.81 * units->meter/PetscSqr(units->second),
539               driving = rho * grav * PetscSinReal(thi->alpha) * 1000*units->meter;
540     THIViscosity(thi,0.5*gradu*gradu,&eta,&deta);
541     thi->rhog = rho * grav;
542     if (thi->verbose) {
543       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Units: meter %8.2g  second %8.2g  kg %8.2g  Pa %8.2g\n",(double)units->meter,(double)units->second,(double)units->kilogram,(double)units->Pascal));
544       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Domain (%6.2g,%6.2g,%6.2g), pressure %8.2g, driving stress %8.2g\n",(double)thi->Lx,(double)thi->Ly,(double)thi->Lz,(double)(rho*grav*1e3*units->meter),(double)driving));
545       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Large velocity 1km/a %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n",(double)u,(double)gradu,(double)eta,(double)(2*eta*gradu),(double)(2*eta*gradu/driving)));
546       THIViscosity(thi,0.5*PetscSqr(1e-3*gradu),&eta,&deta);
547       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Small velocity 1m/a  %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n",(double)(1e-3*u),(double)(1e-3*gradu),(double)eta,(double)(2*eta*1e-3*gradu),(double)(2*eta*1e-3*gradu/driving)));
548     }
549   }
550 
551   *inthi = thi;
552   PetscFunctionReturn(0);
553 }
554 
555 static PetscErrorCode THIInitializePrm(THI thi,DM da2prm,Vec prm)
556 {
557   PrmNode        **p;
558   PetscInt       i,j,xs,xm,ys,ym,mx,my;
559 
560   PetscFunctionBeginUser;
561   PetscCall(DMDAGetGhostCorners(da2prm,&ys,&xs,0,&ym,&xm,0));
562   PetscCall(DMDAGetInfo(da2prm,0, &my,&mx,0, 0,0,0, 0,0,0,0,0,0));
563   PetscCall(DMDAVecGetArray(da2prm,prm,&p));
564   for (i=xs; i<xs+xm; i++) {
565     for (j=ys; j<ys+ym; j++) {
566       PetscReal xx = thi->Lx*i/mx,yy = thi->Ly*j/my;
567       thi->initialize(thi,xx,yy,&p[i][j]);
568     }
569   }
570   PetscCall(DMDAVecRestoreArray(da2prm,prm,&p));
571   PetscFunctionReturn(0);
572 }
573 
574 static PetscErrorCode THISetUpDM(THI thi,DM dm)
575 {
576   PetscInt        refinelevel,coarsenlevel,level,dim,Mx,My,Mz,mx,my,s;
577   DMDAStencilType st;
578   DM              da2prm;
579   Vec             X;
580 
581   PetscFunctionBeginUser;
582   PetscCall(DMDAGetInfo(dm,&dim, &Mz,&My,&Mx, 0,&my,&mx, 0,&s,0,0,0,&st));
583   if (dim == 2) {
584     PetscCall(DMDAGetInfo(dm,&dim, &My,&Mx,0, &my,&mx,0, 0,&s,0,0,0,&st));
585   }
586   PetscCall(DMGetRefineLevel(dm,&refinelevel));
587   PetscCall(DMGetCoarsenLevel(dm,&coarsenlevel));
588   level = refinelevel - coarsenlevel;
589   PetscCall(DMDACreate2d(PetscObjectComm((PetscObject)thi),DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,st,My,Mx,my,mx,sizeof(PrmNode)/sizeof(PetscScalar),s,0,0,&da2prm));
590   PetscCall(DMSetUp(da2prm));
591   PetscCall(DMCreateLocalVector(da2prm,&X));
592   {
593     PetscReal Lx = thi->Lx / thi->units->meter,Ly = thi->Ly / thi->units->meter,Lz = thi->Lz / thi->units->meter;
594     if (dim == 2) {
595       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Level %" PetscInt_FMT " domain size (m) %8.2g x %8.2g, num elements %" PetscInt_FMT " x %" PetscInt_FMT " (%" PetscInt_FMT "), size (m) %g x %g\n",level,(double)Lx,(double)Ly,Mx,My,Mx*My,(double)(Lx/Mx),(double)(Ly/My)));
596     } else {
597       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Level %" PetscInt_FMT " domain size (m) %8.2g x %8.2g x %8.2g, num elements %" PetscInt_FMT " x %" PetscInt_FMT " x %" PetscInt_FMT " (%" PetscInt_FMT "), size (m) %g x %g x %g\n",level,(double)Lx,(double)Ly,(double)Lz,Mx,My,Mz,Mx*My*Mz,(double)(Lx/Mx),(double)(Ly/My),(double)(1000./(Mz-1))));
598     }
599   }
600   PetscCall(THIInitializePrm(thi,da2prm,X));
601   if (thi->tridiagonal) {       /* Reset coarse Jacobian evaluation */
602     PetscCall(DMDASNESSetJacobianLocal(dm,(DMDASNESJacobian)THIJacobianLocal_3D_Full,thi));
603   }
604   if (thi->coarse2d) {
605     PetscCall(DMDASNESSetJacobianLocal(dm,(DMDASNESJacobian)THIJacobianLocal_2D,thi));
606   }
607   PetscCall(PetscObjectCompose((PetscObject)dm,"DMDA2Prm",(PetscObject)da2prm));
608   PetscCall(PetscObjectCompose((PetscObject)dm,"DMDA2Prm_Vec",(PetscObject)X));
609   PetscCall(DMDestroy(&da2prm));
610   PetscCall(VecDestroy(&X));
611   PetscFunctionReturn(0);
612 }
613 
614 static PetscErrorCode DMCoarsenHook_THI(DM dmf,DM dmc,void *ctx)
615 {
616   THI            thi = (THI)ctx;
617   PetscInt       rlevel,clevel;
618 
619   PetscFunctionBeginUser;
620   PetscCall(THISetUpDM(thi,dmc));
621   PetscCall(DMGetRefineLevel(dmc,&rlevel));
622   PetscCall(DMGetCoarsenLevel(dmc,&clevel));
623   if (rlevel-clevel == 0) PetscCall(DMSetMatType(dmc,MATAIJ));
624   PetscCall(DMCoarsenHookAdd(dmc,DMCoarsenHook_THI,NULL,thi));
625   PetscFunctionReturn(0);
626 }
627 
628 static PetscErrorCode DMRefineHook_THI(DM dmc,DM dmf,void *ctx)
629 {
630   THI            thi = (THI)ctx;
631 
632   PetscFunctionBeginUser;
633   PetscCall(THISetUpDM(thi,dmf));
634   PetscCall(DMSetMatType(dmf,thi->mattype));
635   PetscCall(DMRefineHookAdd(dmf,DMRefineHook_THI,NULL,thi));
636   /* With grid sequencing, a formerly-refined DM will later be coarsened by PCSetUp_MG */
637   PetscCall(DMCoarsenHookAdd(dmf,DMCoarsenHook_THI,NULL,thi));
638   PetscFunctionReturn(0);
639 }
640 
641 static PetscErrorCode THIDAGetPrm(DM da,PrmNode ***prm)
642 {
643   DM             da2prm;
644   Vec            X;
645 
646   PetscFunctionBeginUser;
647   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm",(PetscObject*)&da2prm));
648   PetscCheck(da2prm,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm composed with given DMDA");
649   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm_Vec",(PetscObject*)&X));
650   PetscCheck(X,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm_Vec composed with given DMDA");
651   PetscCall(DMDAVecGetArray(da2prm,X,prm));
652   PetscFunctionReturn(0);
653 }
654 
655 static PetscErrorCode THIDARestorePrm(DM da,PrmNode ***prm)
656 {
657   DM             da2prm;
658   Vec            X;
659 
660   PetscFunctionBeginUser;
661   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm",(PetscObject*)&da2prm));
662   PetscCheck(da2prm,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm composed with given DMDA");
663   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm_Vec",(PetscObject*)&X));
664   PetscCheck(X,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm_Vec composed with given DMDA");
665   PetscCall(DMDAVecRestoreArray(da2prm,X,prm));
666   PetscFunctionReturn(0);
667 }
668 
669 static PetscErrorCode THIInitial(SNES snes,Vec X,void *ctx)
670 {
671   THI            thi;
672   PetscInt       i,j,k,xs,xm,ys,ym,zs,zm,mx,my;
673   PetscReal      hx,hy;
674   PrmNode        **prm;
675   Node           ***x;
676   DM             da;
677 
678   PetscFunctionBeginUser;
679   PetscCall(SNESGetDM(snes,&da));
680   PetscCall(DMGetApplicationContext(da,&thi));
681   PetscCall(DMDAGetInfo(da,0, 0,&my,&mx, 0,0,0, 0,0,0,0,0,0));
682   PetscCall(DMDAGetCorners(da,&zs,&ys,&xs,&zm,&ym,&xm));
683   PetscCall(DMDAVecGetArray(da,X,&x));
684   PetscCall(THIDAGetPrm(da,&prm));
685   hx   = thi->Lx / mx;
686   hy   = thi->Ly / my;
687   for (i=xs; i<xs+xm; i++) {
688     for (j=ys; j<ys+ym; j++) {
689       for (k=zs; k<zs+zm; k++) {
690         const PetscScalar zm1      = zm-1,
691                           drivingx = thi->rhog * (prm[i+1][j].b+prm[i+1][j].h - prm[i-1][j].b-prm[i-1][j].h) / (2*hx),
692                           drivingy = thi->rhog * (prm[i][j+1].b+prm[i][j+1].h - prm[i][j-1].b-prm[i][j-1].h) / (2*hy);
693         x[i][j][k].u = 0. * drivingx * prm[i][j].h*(PetscScalar)k/zm1;
694         x[i][j][k].v = 0. * drivingy * prm[i][j].h*(PetscScalar)k/zm1;
695       }
696     }
697   }
698   PetscCall(DMDAVecRestoreArray(da,X,&x));
699   PetscCall(THIDARestorePrm(da,&prm));
700   PetscFunctionReturn(0);
701 }
702 
703 static void PointwiseNonlinearity(THI thi,const Node n[PETSC_RESTRICT],const PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscScalar *PETSC_RESTRICT u,PetscScalar *PETSC_RESTRICT v,PetscScalar du[PETSC_RESTRICT],PetscScalar dv[PETSC_RESTRICT],PetscReal *eta,PetscReal *deta)
704 {
705   PetscInt    l,ll;
706   PetscScalar gam;
707 
708   du[0] = du[1] = du[2] = 0;
709   dv[0] = dv[1] = dv[2] = 0;
710   *u    = 0;
711   *v    = 0;
712   for (l=0; l<8; l++) {
713     *u += phi[l] * n[l].u;
714     *v += phi[l] * n[l].v;
715     for (ll=0; ll<3; ll++) {
716       du[ll] += dphi[l][ll] * n[l].u;
717       dv[ll] += dphi[l][ll] * n[l].v;
718     }
719   }
720   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0]*dv[1] + 0.25*PetscSqr(du[1]+dv[0]) + 0.25*PetscSqr(du[2]) + 0.25*PetscSqr(dv[2]);
721   THIViscosity(thi,PetscRealPart(gam),eta,deta);
722 }
723 
724 static void PointwiseNonlinearity2D(THI thi,Node n[],PetscReal phi[],PetscReal dphi[4][2],PetscScalar *u,PetscScalar *v,PetscScalar du[],PetscScalar dv[],PetscReal *eta,PetscReal *deta)
725 {
726   PetscInt    l,ll;
727   PetscScalar gam;
728 
729   du[0] = du[1] = 0;
730   dv[0] = dv[1] = 0;
731   *u    = 0;
732   *v    = 0;
733   for (l=0; l<4; l++) {
734     *u += phi[l] * n[l].u;
735     *v += phi[l] * n[l].v;
736     for (ll=0; ll<2; ll++) {
737       du[ll] += dphi[l][ll] * n[l].u;
738       dv[ll] += dphi[l][ll] * n[l].v;
739     }
740   }
741   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0]*dv[1] + 0.25*PetscSqr(du[1]+dv[0]);
742   THIViscosity(thi,PetscRealPart(gam),eta,deta);
743 }
744 
745 static PetscErrorCode THIFunctionLocal(DMDALocalInfo *info,Node ***x,Node ***f,THI thi)
746 {
747   PetscInt       xs,ys,xm,ym,zm,i,j,k,q,l;
748   PetscReal      hx,hy,etamin,etamax,beta2min,beta2max;
749   PrmNode        **prm;
750 
751   PetscFunctionBeginUser;
752   xs = info->zs;
753   ys = info->ys;
754   xm = info->zm;
755   ym = info->ym;
756   zm = info->xm;
757   hx = thi->Lx / info->mz;
758   hy = thi->Ly / info->my;
759 
760   etamin   = 1e100;
761   etamax   = 0;
762   beta2min = 1e100;
763   beta2max = 0;
764 
765   PetscCall(THIDAGetPrm(info->da,&prm));
766 
767   for (i=xs; i<xs+xm; i++) {
768     for (j=ys; j<ys+ym; j++) {
769       PrmNode pn[4];
770       QuadExtract(prm,i,j,pn);
771       for (k=0; k<zm-1; k++) {
772         PetscInt  ls = 0;
773         Node      n[8],*fn[8];
774         PetscReal zn[8],etabase = 0;
775         PrmHexGetZ(pn,k,zm,zn);
776         HexExtract(x,i,j,k,n);
777         HexExtractRef(f,i,j,k,fn);
778         if (thi->no_slip && k == 0) {
779           for (l=0; l<4; l++) n[l].u = n[l].v = 0;
780           /* The first 4 basis functions lie on the bottom layer, so their contribution is exactly 0, hence we can skip them */
781           ls = 4;
782         }
783         for (q=0; q<8; q++) {
784           PetscReal   dz[3],phi[8],dphi[8][3],jw,eta,deta;
785           PetscScalar du[3],dv[3],u,v;
786           HexGrad(HexQDeriv[q],zn,dz);
787           HexComputeGeometry(q,hx,hy,dz,phi,dphi,&jw);
788           PointwiseNonlinearity(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
789           jw /= thi->rhog;      /* scales residuals to be O(1) */
790           if (q == 0) etabase = eta;
791           RangeUpdate(&etamin,&etamax,eta);
792           for (l=ls; l<8; l++) { /* test functions */
793             const PetscReal ds[2] = {-PetscSinReal(thi->alpha),0};
794             const PetscReal pp    = phi[l],*dp = dphi[l];
795             fn[l]->u += dp[0]*jw*eta*(4.*du[0]+2.*dv[1]) + dp[1]*jw*eta*(du[1]+dv[0]) + dp[2]*jw*eta*du[2] + pp*jw*thi->rhog*ds[0];
796             fn[l]->v += dp[1]*jw*eta*(2.*du[0]+4.*dv[1]) + dp[0]*jw*eta*(du[1]+dv[0]) + dp[2]*jw*eta*dv[2] + pp*jw*thi->rhog*ds[1];
797           }
798         }
799         if (k == 0) { /* we are on a bottom face */
800           if (thi->no_slip) {
801             /* Note: Non-Galerkin coarse grid operators are very sensitive to the scaling of Dirichlet boundary
802             * conditions.  After shenanigans above, etabase contains the effective viscosity at the closest quadrature
803             * point to the bed.  We want the diagonal entry in the Dirichlet condition to have similar magnitude to the
804             * diagonal entry corresponding to the adjacent node.  The fundamental scaling of the viscous part is in
805             * diagu, diagv below.  This scaling is easy to recognize by considering the finite difference operator after
806             * scaling by element size.  The no-slip Dirichlet condition is scaled by this factor, and also in the
807             * assembled matrix (see the similar block in THIJacobianLocal).
808             *
809             * Note that the residual at this Dirichlet node is linear in the state at this node, but also depends
810             * (nonlinearly in general) on the neighboring interior nodes through the local viscosity.  This will make
811             * a matrix-free Jacobian have extra entries in the corresponding row.  We assemble only the diagonal part,
812             * so the solution will exactly satisfy the boundary condition after the first linear iteration.
813             */
814             const PetscReal   hz    = PetscRealPart(pn[0].h)/(zm-1.);
815             const PetscScalar diagu = 2*etabase/thi->rhog*(hx*hy/hz + hx*hz/hy + 4*hy*hz/hx),diagv = 2*etabase/thi->rhog*(hx*hy/hz + 4*hx*hz/hy + hy*hz/hx);
816             fn[0]->u = thi->dirichlet_scale*diagu*x[i][j][k].u;
817             fn[0]->v = thi->dirichlet_scale*diagv*x[i][j][k].v;
818           } else {              /* Integrate over bottom face to apply boundary condition */
819             for (q=0; q<4; q++) {
820               const PetscReal jw = 0.25*hx*hy/thi->rhog,*phi = QuadQInterp[q];
821               PetscScalar     u  =0,v=0,rbeta2=0;
822               PetscReal       beta2,dbeta2;
823               for (l=0; l<4; l++) {
824                 u      += phi[l]*n[l].u;
825                 v      += phi[l]*n[l].v;
826                 rbeta2 += phi[l]*pn[l].beta2;
827               }
828               THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
829               RangeUpdate(&beta2min,&beta2max,beta2);
830               for (l=0; l<4; l++) {
831                 const PetscReal pp = phi[l];
832                 fn[ls+l]->u += pp*jw*beta2*u;
833                 fn[ls+l]->v += pp*jw*beta2*v;
834               }
835             }
836           }
837         }
838       }
839     }
840   }
841 
842   PetscCall(THIDARestorePrm(info->da,&prm));
843 
844   PetscCall(PRangeMinMax(&thi->eta,etamin,etamax));
845   PetscCall(PRangeMinMax(&thi->beta2,beta2min,beta2max));
846   PetscFunctionReturn(0);
847 }
848 
849 static PetscErrorCode THIMatrixStatistics(THI thi,Mat B,PetscViewer viewer)
850 {
851   PetscReal      nrm;
852   PetscInt       m;
853   PetscMPIInt    rank;
854 
855   PetscFunctionBeginUser;
856   PetscCall(MatNorm(B,NORM_FROBENIUS,&nrm));
857   PetscCall(MatGetSize(B,&m,0));
858   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B),&rank));
859   if (rank == 0) {
860     PetscScalar val0,val2;
861     PetscCall(MatGetValue(B,0,0,&val0));
862     PetscCall(MatGetValue(B,2,2,&val2));
863     PetscCall(PetscViewerASCIIPrintf(viewer,"Matrix dim %" PetscInt_FMT " norm %8.2e (0,0) %8.2e  (2,2) %8.2e %8.2e <= eta <= %8.2e %8.2e <= beta2 <= %8.2e\n",m,(double)nrm,(double)PetscRealPart(val0),(double)PetscRealPart(val2),(double)thi->eta.cmin,(double)thi->eta.cmax,(double)thi->beta2.cmin,(double)thi->beta2.cmax));
864   }
865   PetscFunctionReturn(0);
866 }
867 
868 static PetscErrorCode THISurfaceStatistics(DM da,Vec X,PetscReal *min,PetscReal *max,PetscReal *mean)
869 {
870   Node           ***x;
871   PetscInt       i,j,xs,ys,zs,xm,ym,zm,mx,my,mz;
872   PetscReal      umin = 1e100,umax=-1e100;
873   PetscScalar    usum = 0.0,gusum;
874 
875   PetscFunctionBeginUser;
876   *min = *max = *mean = 0;
877   PetscCall(DMDAGetInfo(da,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
878   PetscCall(DMDAGetCorners(da,&zs,&ys,&xs,&zm,&ym,&xm));
879   PetscCheck(zs == 0 && zm == mz,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected decomposition");
880   PetscCall(DMDAVecGetArray(da,X,&x));
881   for (i=xs; i<xs+xm; i++) {
882     for (j=ys; j<ys+ym; j++) {
883       PetscReal u = PetscRealPart(x[i][j][zm-1].u);
884       RangeUpdate(&umin,&umax,u);
885       usum += u;
886     }
887   }
888   PetscCall(DMDAVecRestoreArray(da,X,&x));
889   PetscCallMPI(MPI_Allreduce(&umin,min,1,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)da)));
890   PetscCallMPI(MPI_Allreduce(&umax,max,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)da)));
891   PetscCallMPI(MPI_Allreduce(&usum,&gusum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)da)));
892   *mean = PetscRealPart(gusum) / (mx*my);
893   PetscFunctionReturn(0);
894 }
895 
896 static PetscErrorCode THISolveStatistics(THI thi,SNES snes,PetscInt coarsened,const char name[])
897 {
898   MPI_Comm       comm;
899   Vec            X;
900   DM             dm;
901 
902   PetscFunctionBeginUser;
903   PetscCall(PetscObjectGetComm((PetscObject)thi,&comm));
904   PetscCall(SNESGetSolution(snes,&X));
905   PetscCall(SNESGetDM(snes,&dm));
906   PetscCall(PetscPrintf(comm,"Solution statistics after solve: %s\n",name));
907   {
908     PetscInt            its,lits;
909     SNESConvergedReason reason;
910     PetscCall(SNESGetIterationNumber(snes,&its));
911     PetscCall(SNESGetConvergedReason(snes,&reason));
912     PetscCall(SNESGetLinearSolveIterations(snes,&lits));
913     PetscCall(PetscPrintf(comm,"%s: Number of SNES iterations = %" PetscInt_FMT ", total linear iterations = %" PetscInt_FMT "\n",SNESConvergedReasons[reason],its,lits));
914   }
915   {
916     PetscReal         nrm2,tmin[3]={1e100,1e100,1e100},tmax[3]={-1e100,-1e100,-1e100},min[3],max[3];
917     PetscInt          i,j,m;
918     const PetscScalar *x;
919     PetscCall(VecNorm(X,NORM_2,&nrm2));
920     PetscCall(VecGetLocalSize(X,&m));
921     PetscCall(VecGetArrayRead(X,&x));
922     for (i=0; i<m; i+=2) {
923       PetscReal u = PetscRealPart(x[i]),v = PetscRealPart(x[i+1]),c = PetscSqrtReal(u*u+v*v);
924       tmin[0] = PetscMin(u,tmin[0]);
925       tmin[1] = PetscMin(v,tmin[1]);
926       tmin[2] = PetscMin(c,tmin[2]);
927       tmax[0] = PetscMax(u,tmax[0]);
928       tmax[1] = PetscMax(v,tmax[1]);
929       tmax[2] = PetscMax(c,tmax[2]);
930     }
931     PetscCall(VecRestoreArrayRead(X,&x));
932     PetscCallMPI(MPI_Allreduce(tmin,min,3,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)thi)));
933     PetscCallMPI(MPI_Allreduce(tmax,max,3,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)thi)));
934     /* Dimensionalize to meters/year */
935     nrm2 *= thi->units->year / thi->units->meter;
936     for (j=0; j<3; j++) {
937       min[j] *= thi->units->year / thi->units->meter;
938       max[j] *= thi->units->year / thi->units->meter;
939     }
940     if (min[0] == 0.0) min[0] = 0.0;
941     PetscCall(PetscPrintf(comm,"|X|_2 %g   %g <= u <=  %g   %g <= v <=  %g   %g <= c <=  %g \n",(double)nrm2,(double)min[0],(double)max[0],(double)min[1],(double)max[1],(double)min[2],(double)max[2]));
942     {
943       PetscReal umin,umax,umean;
944       PetscCall(THISurfaceStatistics(dm,X,&umin,&umax,&umean));
945       umin  *= thi->units->year / thi->units->meter;
946       umax  *= thi->units->year / thi->units->meter;
947       umean *= thi->units->year / thi->units->meter;
948       PetscCall(PetscPrintf(comm,"Surface statistics: u in [%12.6e, %12.6e] mean %12.6e\n",(double)umin,(double)umax,(double)umean));
949     }
950     /* These values stay nondimensional */
951     PetscCall(PetscPrintf(comm,"Global eta range   %g to %g converged range %g to %g\n",(double)thi->eta.min,(double)thi->eta.max,(double)thi->eta.cmin,(double)thi->eta.cmax));
952     PetscCall(PetscPrintf(comm,"Global beta2 range %g to %g converged range %g to %g\n",(double)thi->beta2.min,(double)thi->beta2.max,(double)thi->beta2.cmin,(double)thi->beta2.cmax));
953   }
954   PetscFunctionReturn(0);
955 }
956 
957 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo *info,Node **x,Mat J,Mat B,THI thi)
958 {
959   PetscInt       xs,ys,xm,ym,i,j,q,l,ll;
960   PetscReal      hx,hy;
961   PrmNode        **prm;
962 
963   PetscFunctionBeginUser;
964   xs = info->ys;
965   ys = info->xs;
966   xm = info->ym;
967   ym = info->xm;
968   hx = thi->Lx / info->my;
969   hy = thi->Ly / info->mx;
970 
971   PetscCall(MatZeroEntries(B));
972   PetscCall(THIDAGetPrm(info->da,&prm));
973 
974   for (i=xs; i<xs+xm; i++) {
975     for (j=ys; j<ys+ym; j++) {
976       Node        n[4];
977       PrmNode     pn[4];
978       PetscScalar Ke[4*2][4*2];
979       QuadExtract(prm,i,j,pn);
980       QuadExtract(x,i,j,n);
981       PetscCall(PetscMemzero(Ke,sizeof(Ke)));
982       for (q=0; q<4; q++) {
983         PetscReal   phi[4],dphi[4][2],jw,eta,deta,beta2,dbeta2;
984         PetscScalar u,v,du[2],dv[2],h = 0,rbeta2 = 0;
985         for (l=0; l<4; l++) {
986           phi[l]     = QuadQInterp[q][l];
987           dphi[l][0] = QuadQDeriv[q][l][0]*2./hx;
988           dphi[l][1] = QuadQDeriv[q][l][1]*2./hy;
989           h         += phi[l] * pn[l].h;
990           rbeta2    += phi[l] * pn[l].beta2;
991         }
992         jw = 0.25*hx*hy / thi->rhog; /* rhog is only scaling */
993         PointwiseNonlinearity2D(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
994         THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
995         for (l=0; l<4; l++) {
996           const PetscReal pp = phi[l],*dp = dphi[l];
997           for (ll=0; ll<4; ll++) {
998             const PetscReal ppl = phi[ll],*dpl = dphi[ll];
999             PetscScalar     dgdu,dgdv;
1000             dgdu = 2.*du[0]*dpl[0] + dv[1]*dpl[0] + 0.5*(du[1]+dv[0])*dpl[1];
1001             dgdv = 2.*dv[1]*dpl[1] + du[0]*dpl[1] + 0.5*(du[1]+dv[0])*dpl[0];
1002             /* Picard part */
1003             Ke[l*2+0][ll*2+0] += dp[0]*jw*eta*4.*dpl[0] + dp[1]*jw*eta*dpl[1] + pp*jw*(beta2/h)*ppl*thi->ssa_friction_scale;
1004             Ke[l*2+0][ll*2+1] += dp[0]*jw*eta*2.*dpl[1] + dp[1]*jw*eta*dpl[0];
1005             Ke[l*2+1][ll*2+0] += dp[1]*jw*eta*2.*dpl[0] + dp[0]*jw*eta*dpl[1];
1006             Ke[l*2+1][ll*2+1] += dp[1]*jw*eta*4.*dpl[1] + dp[0]*jw*eta*dpl[0] + pp*jw*(beta2/h)*ppl*thi->ssa_friction_scale;
1007             /* extra Newton terms */
1008             Ke[l*2+0][ll*2+0] += dp[0]*jw*deta*dgdu*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdu*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*u*u*ppl*thi->ssa_friction_scale;
1009             Ke[l*2+0][ll*2+1] += dp[0]*jw*deta*dgdv*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdv*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*u*v*ppl*thi->ssa_friction_scale;
1010             Ke[l*2+1][ll*2+0] += dp[1]*jw*deta*dgdu*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdu*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*v*u*ppl*thi->ssa_friction_scale;
1011             Ke[l*2+1][ll*2+1] += dp[1]*jw*deta*dgdv*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdv*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*v*v*ppl*thi->ssa_friction_scale;
1012           }
1013         }
1014       }
1015       {
1016         const MatStencil rc[4] = {{0,i,j,0},{0,i+1,j,0},{0,i+1,j+1,0},{0,i,j+1,0}};
1017         PetscCall(MatSetValuesBlockedStencil(B,4,rc,4,rc,&Ke[0][0],ADD_VALUES));
1018       }
1019     }
1020   }
1021   PetscCall(THIDARestorePrm(info->da,&prm));
1022 
1023   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1024   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1025   PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
1026   if (thi->verbose) PetscCall(THIMatrixStatistics(thi,B,PETSC_VIEWER_STDOUT_WORLD));
1027   PetscFunctionReturn(0);
1028 }
1029 
1030 static PetscErrorCode THIJacobianLocal_3D(DMDALocalInfo *info,Node ***x,Mat B,THI thi,THIAssemblyMode amode)
1031 {
1032   PetscInt       xs,ys,xm,ym,zm,i,j,k,q,l,ll;
1033   PetscReal      hx,hy;
1034   PrmNode        **prm;
1035 
1036   PetscFunctionBeginUser;
1037   xs = info->zs;
1038   ys = info->ys;
1039   xm = info->zm;
1040   ym = info->ym;
1041   zm = info->xm;
1042   hx = thi->Lx / info->mz;
1043   hy = thi->Ly / info->my;
1044 
1045   PetscCall(MatZeroEntries(B));
1046   PetscCall(MatSetOption(B,MAT_SUBSET_OFF_PROC_ENTRIES,PETSC_TRUE));
1047   PetscCall(THIDAGetPrm(info->da,&prm));
1048 
1049   for (i=xs; i<xs+xm; i++) {
1050     for (j=ys; j<ys+ym; j++) {
1051       PrmNode pn[4];
1052       QuadExtract(prm,i,j,pn);
1053       for (k=0; k<zm-1; k++) {
1054         Node        n[8];
1055         PetscReal   zn[8],etabase = 0;
1056         PetscScalar Ke[8*2][8*2];
1057         PetscInt    ls = 0;
1058 
1059         PrmHexGetZ(pn,k,zm,zn);
1060         HexExtract(x,i,j,k,n);
1061         PetscCall(PetscMemzero(Ke,sizeof(Ke)));
1062         if (thi->no_slip && k == 0) {
1063           for (l=0; l<4; l++) n[l].u = n[l].v = 0;
1064           ls = 4;
1065         }
1066         for (q=0; q<8; q++) {
1067           PetscReal   dz[3],phi[8],dphi[8][3],jw,eta,deta;
1068           PetscScalar du[3],dv[3],u,v;
1069           HexGrad(HexQDeriv[q],zn,dz);
1070           HexComputeGeometry(q,hx,hy,dz,phi,dphi,&jw);
1071           PointwiseNonlinearity(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
1072           jw /= thi->rhog;      /* residuals are scaled by this factor */
1073           if (q == 0) etabase = eta;
1074           for (l=ls; l<8; l++) { /* test functions */
1075             const PetscReal *PETSC_RESTRICT dp = dphi[l];
1076 #if USE_SSE2_KERNELS
1077             /* gcc (up to my 4.5 snapshot) is really bad at hoisting intrinsics so we do it manually */
1078             __m128d
1079               p4         = _mm_set1_pd(4),p2 = _mm_set1_pd(2),p05 = _mm_set1_pd(0.5),
1080               p42        = _mm_setr_pd(4,2),p24 = _mm_shuffle_pd(p42,p42,_MM_SHUFFLE2(0,1)),
1081               du0        = _mm_set1_pd(du[0]),du1 = _mm_set1_pd(du[1]),du2 = _mm_set1_pd(du[2]),
1082               dv0        = _mm_set1_pd(dv[0]),dv1 = _mm_set1_pd(dv[1]),dv2 = _mm_set1_pd(dv[2]),
1083               jweta      = _mm_set1_pd(jw*eta),jwdeta = _mm_set1_pd(jw*deta),
1084               dp0        = _mm_set1_pd(dp[0]),dp1 = _mm_set1_pd(dp[1]),dp2 = _mm_set1_pd(dp[2]),
1085               dp0jweta   = _mm_mul_pd(dp0,jweta),dp1jweta = _mm_mul_pd(dp1,jweta),dp2jweta = _mm_mul_pd(dp2,jweta),
1086               p4du0p2dv1 = _mm_add_pd(_mm_mul_pd(p4,du0),_mm_mul_pd(p2,dv1)), /* 4 du0 + 2 dv1 */
1087               p4dv1p2du0 = _mm_add_pd(_mm_mul_pd(p4,dv1),_mm_mul_pd(p2,du0)), /* 4 dv1 + 2 du0 */
1088               pdu2dv2    = _mm_unpacklo_pd(du2,dv2),                          /* [du2, dv2] */
1089               du1pdv0    = _mm_add_pd(du1,dv0),                               /* du1 + dv0 */
1090               t1         = _mm_mul_pd(dp0,p4du0p2dv1),                        /* dp0 (4 du0 + 2 dv1) */
1091               t2         = _mm_mul_pd(dp1,p4dv1p2du0);                        /* dp1 (4 dv1 + 2 du0) */
1092 
1093 #endif
1094 #if defined COMPUTE_LOWER_TRIANGULAR  /* The element matrices are always symmetric so computing the lower-triangular part is not necessary */
1095             for (ll=ls; ll<8; ll++) { /* trial functions */
1096 #else
1097             for (ll=l; ll<8; ll++) {
1098 #endif
1099               const PetscReal *PETSC_RESTRICT dpl = dphi[ll];
1100               if (amode == THIASSEMBLY_TRIDIAGONAL && (l-ll)%4) continue; /* these entries would not be inserted */
1101 #if !USE_SSE2_KERNELS
1102               /* The analytic Jacobian in nice, easy-to-read form */
1103               {
1104                 PetscScalar dgdu,dgdv;
1105                 dgdu = 2.*du[0]*dpl[0] + dv[1]*dpl[0] + 0.5*(du[1]+dv[0])*dpl[1] + 0.5*du[2]*dpl[2];
1106                 dgdv = 2.*dv[1]*dpl[1] + du[0]*dpl[1] + 0.5*(du[1]+dv[0])*dpl[0] + 0.5*dv[2]*dpl[2];
1107                 /* Picard part */
1108                 Ke[l*2+0][ll*2+0] += dp[0]*jw*eta*4.*dpl[0] + dp[1]*jw*eta*dpl[1] + dp[2]*jw*eta*dpl[2];
1109                 Ke[l*2+0][ll*2+1] += dp[0]*jw*eta*2.*dpl[1] + dp[1]*jw*eta*dpl[0];
1110                 Ke[l*2+1][ll*2+0] += dp[1]*jw*eta*2.*dpl[0] + dp[0]*jw*eta*dpl[1];
1111                 Ke[l*2+1][ll*2+1] += dp[1]*jw*eta*4.*dpl[1] + dp[0]*jw*eta*dpl[0] + dp[2]*jw*eta*dpl[2];
1112                 /* extra Newton terms */
1113                 Ke[l*2+0][ll*2+0] += dp[0]*jw*deta*dgdu*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdu*(du[1]+dv[0]) + dp[2]*jw*deta*dgdu*du[2];
1114                 Ke[l*2+0][ll*2+1] += dp[0]*jw*deta*dgdv*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdv*(du[1]+dv[0]) + dp[2]*jw*deta*dgdv*du[2];
1115                 Ke[l*2+1][ll*2+0] += dp[1]*jw*deta*dgdu*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdu*(du[1]+dv[0]) + dp[2]*jw*deta*dgdu*dv[2];
1116                 Ke[l*2+1][ll*2+1] += dp[1]*jw*deta*dgdv*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdv*(du[1]+dv[0]) + dp[2]*jw*deta*dgdv*dv[2];
1117               }
1118 #else
1119               /* This SSE2 code is an exact replica of above, but uses explicit packed instructions for some speed
1120               * benefit.  On my hardware, these intrinsics are almost twice as fast as above, reducing total assembly cost
1121               * by 25 to 30 percent. */
1122               {
1123                 __m128d
1124                   keu   = _mm_loadu_pd(&Ke[l*2+0][ll*2+0]),
1125                   kev   = _mm_loadu_pd(&Ke[l*2+1][ll*2+0]),
1126                   dpl01 = _mm_loadu_pd(&dpl[0]),dpl10 = _mm_shuffle_pd(dpl01,dpl01,_MM_SHUFFLE2(0,1)),dpl2 = _mm_set_sd(dpl[2]),
1127                   t0,t3,pdgduv;
1128                 keu = _mm_add_pd(keu,_mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp0jweta,p42),dpl01),
1129                                                 _mm_add_pd(_mm_mul_pd(dp1jweta,dpl10),
1130                                                            _mm_mul_pd(dp2jweta,dpl2))));
1131                 kev = _mm_add_pd(kev,_mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp1jweta,p24),dpl01),
1132                                                 _mm_add_pd(_mm_mul_pd(dp0jweta,dpl10),
1133                                                            _mm_mul_pd(dp2jweta,_mm_shuffle_pd(dpl2,dpl2,_MM_SHUFFLE2(0,1))))));
1134                 pdgduv = _mm_mul_pd(p05,_mm_add_pd(_mm_add_pd(_mm_mul_pd(p42,_mm_mul_pd(du0,dpl01)),
1135                                                               _mm_mul_pd(p24,_mm_mul_pd(dv1,dpl01))),
1136                                                    _mm_add_pd(_mm_mul_pd(du1pdv0,dpl10),
1137                                                               _mm_mul_pd(pdu2dv2,_mm_set1_pd(dpl[2]))))); /* [dgdu, dgdv] */
1138                 t0 = _mm_mul_pd(jwdeta,pdgduv);  /* jw deta [dgdu, dgdv] */
1139                 t3 = _mm_mul_pd(t0,du1pdv0);     /* t0 (du1 + dv0) */
1140                 _mm_storeu_pd(&Ke[l*2+0][ll*2+0],_mm_add_pd(keu,_mm_add_pd(_mm_mul_pd(t1,t0),
1141                                                                            _mm_add_pd(_mm_mul_pd(dp1,t3),
1142                                                                                       _mm_mul_pd(t0,_mm_mul_pd(dp2,du2))))));
1143                 _mm_storeu_pd(&Ke[l*2+1][ll*2+0],_mm_add_pd(kev,_mm_add_pd(_mm_mul_pd(t2,t0),
1144                                                                            _mm_add_pd(_mm_mul_pd(dp0,t3),
1145                                                                                       _mm_mul_pd(t0,_mm_mul_pd(dp2,dv2))))));
1146               }
1147 #endif
1148             }
1149           }
1150         }
1151         if (k == 0) { /* on a bottom face */
1152           if (thi->no_slip) {
1153             const PetscReal   hz    = PetscRealPart(pn[0].h)/(zm-1);
1154             const PetscScalar diagu = 2*etabase/thi->rhog*(hx*hy/hz + hx*hz/hy + 4*hy*hz/hx),diagv = 2*etabase/thi->rhog*(hx*hy/hz + 4*hx*hz/hy + hy*hz/hx);
1155             Ke[0][0] = thi->dirichlet_scale*diagu;
1156             Ke[1][1] = thi->dirichlet_scale*diagv;
1157           } else {
1158             for (q=0; q<4; q++) {
1159               const PetscReal jw = 0.25*hx*hy/thi->rhog,*phi = QuadQInterp[q];
1160               PetscScalar     u  =0,v=0,rbeta2=0;
1161               PetscReal       beta2,dbeta2;
1162               for (l=0; l<4; l++) {
1163                 u      += phi[l]*n[l].u;
1164                 v      += phi[l]*n[l].v;
1165                 rbeta2 += phi[l]*pn[l].beta2;
1166               }
1167               THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
1168               for (l=0; l<4; l++) {
1169                 const PetscReal pp = phi[l];
1170                 for (ll=0; ll<4; ll++) {
1171                   const PetscReal ppl = phi[ll];
1172                   Ke[l*2+0][ll*2+0] += pp*jw*beta2*ppl + pp*jw*dbeta2*u*u*ppl;
1173                   Ke[l*2+0][ll*2+1] +=                   pp*jw*dbeta2*u*v*ppl;
1174                   Ke[l*2+1][ll*2+0] +=                   pp*jw*dbeta2*v*u*ppl;
1175                   Ke[l*2+1][ll*2+1] += pp*jw*beta2*ppl + pp*jw*dbeta2*v*v*ppl;
1176                 }
1177               }
1178             }
1179           }
1180         }
1181         {
1182           const MatStencil rc[8] = {{i,j,k,0},{i+1,j,k,0},{i+1,j+1,k,0},{i,j+1,k,0},{i,j,k+1,0},{i+1,j,k+1,0},{i+1,j+1,k+1,0},{i,j+1,k+1,0}};
1183           if (amode == THIASSEMBLY_TRIDIAGONAL) {
1184             for (l=0; l<4; l++) { /* Copy out each of the blocks, discarding horizontal coupling */
1185               const PetscInt   l4     = l+4;
1186               const MatStencil rcl[2] = {{rc[l].k,rc[l].j,rc[l].i,0},{rc[l4].k,rc[l4].j,rc[l4].i,0}};
1187 #if defined COMPUTE_LOWER_TRIANGULAR
1188               const PetscScalar Kel[4][4] = {{Ke[2*l+0][2*l+0] ,Ke[2*l+0][2*l+1] ,Ke[2*l+0][2*l4+0] ,Ke[2*l+0][2*l4+1]},
1189                                              {Ke[2*l+1][2*l+0] ,Ke[2*l+1][2*l+1] ,Ke[2*l+1][2*l4+0] ,Ke[2*l+1][2*l4+1]},
1190                                              {Ke[2*l4+0][2*l+0],Ke[2*l4+0][2*l+1],Ke[2*l4+0][2*l4+0],Ke[2*l4+0][2*l4+1]},
1191                                              {Ke[2*l4+1][2*l+0],Ke[2*l4+1][2*l+1],Ke[2*l4+1][2*l4+0],Ke[2*l4+1][2*l4+1]}};
1192 #else
1193               /* Same as above except for the lower-left block */
1194               const PetscScalar Kel[4][4] = {{Ke[2*l+0][2*l+0] ,Ke[2*l+0][2*l+1] ,Ke[2*l+0][2*l4+0] ,Ke[2*l+0][2*l4+1]},
1195                                              {Ke[2*l+1][2*l+0] ,Ke[2*l+1][2*l+1] ,Ke[2*l+1][2*l4+0] ,Ke[2*l+1][2*l4+1]},
1196                                              {Ke[2*l+0][2*l4+0],Ke[2*l+1][2*l4+0],Ke[2*l4+0][2*l4+0],Ke[2*l4+0][2*l4+1]},
1197                                              {Ke[2*l+0][2*l4+1],Ke[2*l+1][2*l4+1],Ke[2*l4+1][2*l4+0],Ke[2*l4+1][2*l4+1]}};
1198 #endif
1199               PetscCall(MatSetValuesBlockedStencil(B,2,rcl,2,rcl,&Kel[0][0],ADD_VALUES));
1200             }
1201           } else {
1202 #if !defined COMPUTE_LOWER_TRIANGULAR /* fill in lower-triangular part, this is really cheap compared to computing the entries */
1203             for (l=0; l<8; l++) {
1204               for (ll=l+1; ll<8; ll++) {
1205                 Ke[ll*2+0][l*2+0] = Ke[l*2+0][ll*2+0];
1206                 Ke[ll*2+1][l*2+0] = Ke[l*2+0][ll*2+1];
1207                 Ke[ll*2+0][l*2+1] = Ke[l*2+1][ll*2+0];
1208                 Ke[ll*2+1][l*2+1] = Ke[l*2+1][ll*2+1];
1209               }
1210             }
1211 #endif
1212             PetscCall(MatSetValuesBlockedStencil(B,8,rc,8,rc,&Ke[0][0],ADD_VALUES));
1213           }
1214         }
1215       }
1216     }
1217   }
1218   PetscCall(THIDARestorePrm(info->da,&prm));
1219 
1220   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1221   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1222   PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
1223   if (thi->verbose) PetscCall(THIMatrixStatistics(thi,B,PETSC_VIEWER_STDOUT_WORLD));
1224   PetscFunctionReturn(0);
1225 }
1226 
1227 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo *info,Node ***x,Mat A,Mat B,THI thi)
1228 {
1229   PetscFunctionBeginUser;
1230   PetscCall(THIJacobianLocal_3D(info,x,B,thi,THIASSEMBLY_FULL));
1231   PetscFunctionReturn(0);
1232 }
1233 
1234 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo *info,Node ***x,Mat A,Mat B,THI thi)
1235 {
1236   PetscFunctionBeginUser;
1237   PetscCall(THIJacobianLocal_3D(info,x,B,thi,THIASSEMBLY_TRIDIAGONAL));
1238   PetscFunctionReturn(0);
1239 }
1240 
1241 static PetscErrorCode DMRefineHierarchy_THI(DM dac0,PetscInt nlevels,DM hierarchy[])
1242 {
1243   THI             thi;
1244   PetscInt        dim,M,N,m,n,s,dof;
1245   DM              dac,daf;
1246   DMDAStencilType st;
1247   DM_DA           *ddf,*ddc;
1248 
1249   PetscFunctionBeginUser;
1250   PetscCall(PetscObjectQuery((PetscObject)dac0,"THI",(PetscObject*)&thi));
1251   PetscCheck(thi,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot refine this DMDA, missing composed THI instance");
1252   if (nlevels > 1) {
1253     PetscCall(DMRefineHierarchy(dac0,nlevels-1,hierarchy));
1254     dac  = hierarchy[nlevels-2];
1255   } else {
1256     dac = dac0;
1257   }
1258   PetscCall(DMDAGetInfo(dac,&dim, &N,&M,0, &n,&m,0, &dof,&s,0,0,0,&st));
1259   PetscCheck(dim == 2,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"This function can only refine 2D DMDAs");
1260 
1261   /* Creates a 3D DMDA with the same map-plane layout as the 2D one, with contiguous columns */
1262   PetscCall(DMDACreate3d(PetscObjectComm((PetscObject)dac),DM_BOUNDARY_NONE,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,st,thi->zlevels,N,M,1,n,m,dof,s,NULL,NULL,NULL,&daf));
1263   PetscCall(DMSetUp(daf));
1264 
1265   daf->ops->creatematrix        = dac->ops->creatematrix;
1266   daf->ops->createinterpolation = dac->ops->createinterpolation;
1267   daf->ops->getcoloring         = dac->ops->getcoloring;
1268   ddf                           = (DM_DA*)daf->data;
1269   ddc                           = (DM_DA*)dac->data;
1270   ddf->interptype               = ddc->interptype;
1271 
1272   PetscCall(DMDASetFieldName(daf,0,"x-velocity"));
1273   PetscCall(DMDASetFieldName(daf,1,"y-velocity"));
1274 
1275   hierarchy[nlevels-1] = daf;
1276   PetscFunctionReturn(0);
1277 }
1278 
1279 static PetscErrorCode DMCreateInterpolation_DA_THI(DM dac,DM daf,Mat *A,Vec *scale)
1280 {
1281   PetscInt       dim;
1282 
1283   PetscFunctionBeginUser;
1284   PetscValidHeaderSpecific(dac,DM_CLASSID,1);
1285   PetscValidHeaderSpecific(daf,DM_CLASSID,2);
1286   PetscValidPointer(A,3);
1287   if (scale) PetscValidPointer(scale,4);
1288   PetscCall(DMDAGetInfo(daf,&dim,0,0,0,0,0,0,0,0,0,0,0,0));
1289   if (dim  == 2) {
1290     /* We are in the 2D problem and use normal DMDA interpolation */
1291     PetscCall(DMCreateInterpolation(dac,daf,A,scale));
1292   } else {
1293     PetscInt i,j,k,xs,ys,zs,xm,ym,zm,mx,my,mz,rstart,cstart;
1294     Mat      B;
1295 
1296     PetscCall(DMDAGetInfo(daf,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
1297     PetscCall(DMDAGetCorners(daf,&zs,&ys,&xs,&zm,&ym,&xm));
1298     PetscCheck(!zs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"unexpected");
1299     PetscCall(MatCreate(PetscObjectComm((PetscObject)daf),&B));
1300     PetscCall(MatSetSizes(B,xm*ym*zm,xm*ym,mx*my*mz,mx*my));
1301 
1302     PetscCall(MatSetType(B,MATAIJ));
1303     PetscCall(MatSeqAIJSetPreallocation(B,1,NULL));
1304     PetscCall(MatMPIAIJSetPreallocation(B,1,NULL,0,NULL));
1305     PetscCall(MatGetOwnershipRange(B,&rstart,NULL));
1306     PetscCall(MatGetOwnershipRangeColumn(B,&cstart,NULL));
1307     for (i=xs; i<xs+xm; i++) {
1308       for (j=ys; j<ys+ym; j++) {
1309         for (k=zs; k<zs+zm; k++) {
1310           PetscInt    i2  = i*ym+j,i3 = i2*zm+k;
1311           PetscScalar val = ((k == 0 || k == mz-1) ? 0.5 : 1.) / (mz-1.); /* Integration using trapezoid rule */
1312           PetscCall(MatSetValue(B,cstart+i3,rstart+i2,val,INSERT_VALUES));
1313         }
1314       }
1315     }
1316     PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1317     PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1318     PetscCall(MatCreateMAIJ(B,sizeof(Node)/sizeof(PetscScalar),A));
1319     PetscCall(MatDestroy(&B));
1320   }
1321   PetscFunctionReturn(0);
1322 }
1323 
1324 static PetscErrorCode DMCreateMatrix_THI_Tridiagonal(DM da,Mat *J)
1325 {
1326   Mat                    A;
1327   PetscInt               xm,ym,zm,dim,dof = 2,starts[3],dims[3];
1328   ISLocalToGlobalMapping ltog;
1329 
1330   PetscFunctionBeginUser;
1331   PetscCall(DMDAGetInfo(da,&dim, 0,0,0, 0,0,0, 0,0,0,0,0,0));
1332   PetscCheck(dim == 3,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Expected DMDA to be 3D");
1333   PetscCall(DMDAGetCorners(da,0,0,0,&zm,&ym,&xm));
1334   PetscCall(DMGetLocalToGlobalMapping(da,&ltog));
1335   PetscCall(MatCreate(PetscObjectComm((PetscObject)da),&A));
1336   PetscCall(MatSetSizes(A,dof*xm*ym*zm,dof*xm*ym*zm,PETSC_DETERMINE,PETSC_DETERMINE));
1337   PetscCall(MatSetType(A,da->mattype));
1338   PetscCall(MatSetFromOptions(A));
1339   PetscCall(MatSeqAIJSetPreallocation(A,3*2,NULL));
1340   PetscCall(MatMPIAIJSetPreallocation(A,3*2,NULL,0,NULL));
1341   PetscCall(MatSeqBAIJSetPreallocation(A,2,3,NULL));
1342   PetscCall(MatMPIBAIJSetPreallocation(A,2,3,NULL,0,NULL));
1343   PetscCall(MatSeqSBAIJSetPreallocation(A,2,2,NULL));
1344   PetscCall(MatMPISBAIJSetPreallocation(A,2,2,NULL,0,NULL));
1345   PetscCall(MatSetLocalToGlobalMapping(A,ltog,ltog));
1346   PetscCall(DMDAGetGhostCorners(da,&starts[0],&starts[1],&starts[2],&dims[0],&dims[1],&dims[2]));
1347   PetscCall(MatSetStencil(A,dim,dims,starts,dof));
1348   *J   = A;
1349   PetscFunctionReturn(0);
1350 }
1351 
1352 static PetscErrorCode THIDAVecView_VTK_XML(THI thi,DM da,Vec X,const char filename[])
1353 {
1354   const PetscInt    dof   = 2;
1355   Units             units = thi->units;
1356   MPI_Comm          comm;
1357   PetscViewer       viewer;
1358   PetscMPIInt       rank,size,tag,nn,nmax;
1359   PetscInt          mx,my,mz,r,range[6];
1360   const PetscScalar *x;
1361 
1362   PetscFunctionBeginUser;
1363   PetscCall(PetscObjectGetComm((PetscObject)thi,&comm));
1364   PetscCall(DMDAGetInfo(da,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
1365   PetscCallMPI(MPI_Comm_size(comm,&size));
1366   PetscCallMPI(MPI_Comm_rank(comm,&rank));
1367   PetscCall(PetscViewerASCIIOpen(comm,filename,&viewer));
1368   PetscCall(PetscViewerASCIIPrintf(viewer,"<VTKFile type=\"StructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n"));
1369   PetscCall(PetscViewerASCIIPrintf(viewer,"  <StructuredGrid WholeExtent=\"%d %" PetscInt_FMT " %d %" PetscInt_FMT " %d %" PetscInt_FMT "\">\n",0,mz-1,0,my-1,0,mx-1));
1370 
1371   PetscCall(DMDAGetCorners(da,range,range+1,range+2,range+3,range+4,range+5));
1372   PetscCall(PetscMPIIntCast(range[3]*range[4]*range[5]*dof,&nn));
1373   PetscCallMPI(MPI_Reduce(&nn,&nmax,1,MPI_INT,MPI_MAX,0,comm));
1374   tag  = ((PetscObject) viewer)->tag;
1375   PetscCall(VecGetArrayRead(X,&x));
1376   if (rank == 0) {
1377     PetscScalar *array;
1378     PetscCall(PetscMalloc1(nmax,&array));
1379     for (r=0; r<size; r++) {
1380       PetscInt          i,j,k,xs,xm,ys,ym,zs,zm;
1381       const PetscScalar *ptr;
1382       MPI_Status        status;
1383       if (r) {
1384         PetscCallMPI(MPI_Recv(range,6,MPIU_INT,r,tag,comm,MPI_STATUS_IGNORE));
1385       }
1386       zs = range[0];ys = range[1];xs = range[2];zm = range[3];ym = range[4];xm = range[5];
1387       PetscCheck(xm*ym*zm*dof <= nmax,PETSC_COMM_SELF,PETSC_ERR_PLIB,"should not happen");
1388       if (r) {
1389         PetscCallMPI(MPI_Recv(array,nmax,MPIU_SCALAR,r,tag,comm,&status));
1390         PetscCallMPI(MPI_Get_count(&status,MPIU_SCALAR,&nn));
1391         PetscCheck(nn == xm*ym*zm*dof,PETSC_COMM_SELF,PETSC_ERR_PLIB,"should not happen");
1392         ptr = array;
1393       } else ptr = x;
1394       PetscCall(PetscViewerASCIIPrintf(viewer,"    <Piece Extent=\"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\">\n",zs,zs+zm-1,ys,ys+ym-1,xs,xs+xm-1));
1395 
1396       PetscCall(PetscViewerASCIIPrintf(viewer,"      <Points>\n"));
1397       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1398       for (i=xs; i<xs+xm; i++) {
1399         for (j=ys; j<ys+ym; j++) {
1400           for (k=zs; k<zs+zm; k++) {
1401             PrmNode   p;
1402             PetscReal xx = thi->Lx*i/mx,yy = thi->Ly*j/my,zz;
1403             thi->initialize(thi,xx,yy,&p);
1404             zz   = PetscRealPart(p.b) + PetscRealPart(p.h)*k/(mz-1);
1405             PetscCall(PetscViewerASCIIPrintf(viewer,"%f %f %f\n",(double)xx,(double)yy,(double)zz));
1406           }
1407         }
1408       }
1409       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1410       PetscCall(PetscViewerASCIIPrintf(viewer,"      </Points>\n"));
1411 
1412       PetscCall(PetscViewerASCIIPrintf(viewer,"      <PointData>\n"));
1413       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Float32\" Name=\"velocity\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1414       for (i=0; i<nn; i+=dof) {
1415         PetscCall(PetscViewerASCIIPrintf(viewer,"%f %f %f\n",(double)(PetscRealPart(ptr[i])*units->year/units->meter),(double)(PetscRealPart(ptr[i+1])*units->year/units->meter),0.0));
1416       }
1417       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1418 
1419       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Int32\" Name=\"rank\" NumberOfComponents=\"1\" format=\"ascii\">\n"));
1420       for (i=0; i<nn; i+=dof) {
1421         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT "\n",r));
1422       }
1423       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1424       PetscCall(PetscViewerASCIIPrintf(viewer,"      </PointData>\n"));
1425 
1426       PetscCall(PetscViewerASCIIPrintf(viewer,"    </Piece>\n"));
1427     }
1428     PetscCall(PetscFree(array));
1429   } else {
1430     PetscCallMPI(MPI_Send(range,6,MPIU_INT,0,tag,comm));
1431     PetscCallMPI(MPI_Send((PetscScalar*)x,nn,MPIU_SCALAR,0,tag,comm));
1432   }
1433   PetscCall(VecRestoreArrayRead(X,&x));
1434   PetscCall(PetscViewerASCIIPrintf(viewer,"  </StructuredGrid>\n"));
1435   PetscCall(PetscViewerASCIIPrintf(viewer,"</VTKFile>\n"));
1436   PetscCall(PetscViewerDestroy(&viewer));
1437   PetscFunctionReturn(0);
1438 }
1439 
1440 int main(int argc,char *argv[])
1441 {
1442   MPI_Comm       comm;
1443   THI            thi;
1444   DM             da;
1445   SNES           snes;
1446 
1447   PetscCall(PetscInitialize(&argc,&argv,0,help));
1448   comm = PETSC_COMM_WORLD;
1449 
1450   PetscCall(THICreate(comm,&thi));
1451   {
1452     PetscInt M = 3,N = 3,P = 2;
1453     PetscOptionsBegin(comm,NULL,"Grid resolution options","");
1454     {
1455       PetscCall(PetscOptionsInt("-M","Number of elements in x-direction on coarse level","",M,&M,NULL));
1456       N    = M;
1457       PetscCall(PetscOptionsInt("-N","Number of elements in y-direction on coarse level (if different from M)","",N,&N,NULL));
1458       if (thi->coarse2d) {
1459         PetscCall(PetscOptionsInt("-zlevels","Number of elements in z-direction on fine level","",thi->zlevels,&thi->zlevels,NULL));
1460       } else {
1461         PetscCall(PetscOptionsInt("-P","Number of elements in z-direction on coarse level","",P,&P,NULL));
1462       }
1463     }
1464     PetscOptionsEnd();
1465     if (thi->coarse2d) {
1466       PetscCall(DMDACreate2d(comm,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,DMDA_STENCIL_BOX,N,M,PETSC_DETERMINE,PETSC_DETERMINE,sizeof(Node)/sizeof(PetscScalar),1,0,0,&da));
1467       PetscCall(DMSetFromOptions(da));
1468       PetscCall(DMSetUp(da));
1469       da->ops->refinehierarchy     = DMRefineHierarchy_THI;
1470       da->ops->createinterpolation = DMCreateInterpolation_DA_THI;
1471 
1472       PetscCall(PetscObjectCompose((PetscObject)da,"THI",(PetscObject)thi));
1473     } else {
1474       PetscCall(DMDACreate3d(comm,DM_BOUNDARY_NONE,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC, DMDA_STENCIL_BOX,P,N,M,1,PETSC_DETERMINE,PETSC_DETERMINE,sizeof(Node)/sizeof(PetscScalar),1,0,0,0,&da));
1475       PetscCall(DMSetFromOptions(da));
1476       PetscCall(DMSetUp(da));
1477     }
1478     PetscCall(DMDASetFieldName(da,0,"x-velocity"));
1479     PetscCall(DMDASetFieldName(da,1,"y-velocity"));
1480   }
1481   PetscCall(THISetUpDM(thi,da));
1482   if (thi->tridiagonal) da->ops->creatematrix = DMCreateMatrix_THI_Tridiagonal;
1483 
1484   {                             /* Set the fine level matrix type if -da_refine */
1485     PetscInt rlevel,clevel;
1486     PetscCall(DMGetRefineLevel(da,&rlevel));
1487     PetscCall(DMGetCoarsenLevel(da,&clevel));
1488     if (rlevel - clevel > 0) PetscCall(DMSetMatType(da,thi->mattype));
1489   }
1490 
1491   PetscCall(DMDASNESSetFunctionLocal(da,ADD_VALUES,(DMDASNESFunction)THIFunctionLocal,thi));
1492   if (thi->tridiagonal) {
1493     PetscCall(DMDASNESSetJacobianLocal(da,(DMDASNESJacobian)THIJacobianLocal_3D_Tridiagonal,thi));
1494   } else {
1495     PetscCall(DMDASNESSetJacobianLocal(da,(DMDASNESJacobian)THIJacobianLocal_3D_Full,thi));
1496   }
1497   PetscCall(DMCoarsenHookAdd(da,DMCoarsenHook_THI,NULL,thi));
1498   PetscCall(DMRefineHookAdd(da,DMRefineHook_THI,NULL,thi));
1499 
1500   PetscCall(DMSetApplicationContext(da,thi));
1501 
1502   PetscCall(SNESCreate(comm,&snes));
1503   PetscCall(SNESSetDM(snes,da));
1504   PetscCall(DMDestroy(&da));
1505   PetscCall(SNESSetComputeInitialGuess(snes,THIInitial,NULL));
1506   PetscCall(SNESSetFromOptions(snes));
1507 
1508   PetscCall(SNESSolve(snes,NULL,NULL));
1509 
1510   PetscCall(THISolveStatistics(thi,snes,0,"Full"));
1511 
1512   {
1513     PetscBool flg;
1514     char      filename[PETSC_MAX_PATH_LEN] = "";
1515     PetscCall(PetscOptionsGetString(NULL,NULL,"-o",filename,sizeof(filename),&flg));
1516     if (flg) {
1517       Vec X;
1518       DM  dm;
1519       PetscCall(SNESGetSolution(snes,&X));
1520       PetscCall(SNESGetDM(snes,&dm));
1521       PetscCall(THIDAVecView_VTK_XML(thi,dm,X,filename));
1522     }
1523   }
1524 
1525   PetscCall(DMDestroy(&da));
1526   PetscCall(SNESDestroy(&snes));
1527   PetscCall(THIDestroy(&thi));
1528   PetscCall(PetscFinalize());
1529   return 0;
1530 }
1531 
1532 /*TEST
1533 
1534    build:
1535       requires: !single
1536 
1537    test:
1538       args: -M 6 -P 4 -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type icc
1539 
1540    test:
1541       suffix: 2
1542       nsize: 2
1543       args: -M 6 -P 4 -thi_hom z -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 6 -mg_levels_0_pc_type redundant -snes_grid_sequence 1 -mat_partitioning_type current -ksp_atol -1
1544 
1545    test:
1546       suffix: 3
1547       nsize: 3
1548       args: -M 7 -P 4 -thi_hom z -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type baij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_pc_asm_type restrict -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 9 -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mat_partitioning_type current
1549 
1550    test:
1551       suffix: 4
1552       nsize: 6
1553       args: -M 4 -P 2 -da_refine_hierarchy_x 1,1,3 -da_refine_hierarchy_y 2,2,1 -da_refine_hierarchy_z 2,2,1 -snes_grid_sequence 3 -ksp_converged_reason -ksp_type fgmres -ksp_rtol 1e-2 -pc_type mg -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type bjacobi -mg_levels_1_sub_pc_type cholesky -pc_mg_type multiplicative -snes_converged_reason -snes_stol 1e-12 -thi_L 80e3 -thi_alpha 0.05 -thi_friction_m 1 -thi_hom x -snes_view -mg_levels_0_pc_type redundant -mg_levels_0_ksp_type preonly -ksp_atol -1
1554 
1555    test:
1556       suffix: 5
1557       nsize: 6
1558       args: -M 12 -P 5 -snes_monitor_short -ksp_converged_reason -pc_type asm -pc_asm_type restrict -dm_mat_type {{aij baij sbaij}}
1559 
1560 TEST*/
1561