xref: /petsc/src/snes/tutorials/ex48.c (revision 4663dae6020ed40d417a3314b7d84f74d0cdce91)
1 static const char help[] = "Toy hydrostatic ice flow with multigrid in 3D.\n\
2 \n\
3 Solves the hydrostatic (aka Blatter/Pattyn/First Order) equations for ice sheet flow\n\
4 using multigrid.  The ice uses a power-law rheology with \"Glen\" exponent 3 (corresponds\n\
5 to p=4/3 in a p-Laplacian).  The focus is on ISMIP-HOM experiments which assume periodic\n\
6 boundary conditions in the x- and y-directions.\n\
7 \n\
8 Equations are rescaled so that the domain size and solution are O(1), details of this scaling\n\
9 can be controlled by the options -units_meter, -units_second, and -units_kilogram.\n\
10 \n\
11 A VTK StructuredGrid output file can be written using the option -o filename.vts\n\
12 \n\n";
13 
14 /*
15 The equations for horizontal velocity (u,v) are
16 
17   - [eta (4 u_x + 2 v_y)]_x - [eta (u_y + v_x)]_y - [eta u_z]_z + rho g s_x = 0
18   - [eta (4 v_y + 2 u_x)]_y - [eta (u_y + v_x)]_x - [eta v_z]_z + rho g s_y = 0
19 
20 where
21 
22   eta = B/2 (epsilon + gamma)^((p-2)/2)
23 
24 is the nonlinear effective viscosity with regularization epsilon and hardness parameter B,
25 written in terms of the second invariant
26 
27   gamma = u_x^2 + v_y^2 + u_x v_y + (1/4) (u_y + v_x)^2 + (1/4) u_z^2 + (1/4) v_z^2
28 
29 The surface boundary conditions are the natural conditions.  The basal boundary conditions
30 are either no-slip, or Navier (linear) slip with spatially variant friction coefficient beta^2.
31 
32 In the code, the equations for (u,v) are multiplied through by 1/(rho g) so that residuals are O(1).
33 
34 The discretization is Q1 finite elements, managed by a DMDA.  The grid is never distorted in the
35 map (x,y) plane, but the bed and surface may be bumpy.  This is handled as usual in FEM, through
36 the Jacobian of the coordinate transformation from a reference element to the physical element.
37 
38 Since ice-flow is tightly coupled in the z-direction (within columns), the DMDA is managed
39 specially so that columns are never distributed, and are always contiguous in memory.
40 This amounts to reversing the meaning of X,Y,Z compared to the DMDA's internal interpretation,
41 and then indexing as vec[i][j][k].  The exotic coarse spaces require 2D DMDAs which are made to
42 use compatible domain decomposition relative to the 3D DMDAs.
43 
44 There are two compile-time options:
45 
46   NO_SSE2:
47     If the host supports SSE2, we use integration code that has been vectorized with SSE2
48     intrinsics, unless this macro is defined.  The intrinsics speed up integration by about
49     30% on my architecture (P8700, gcc-4.5 snapshot).
50 
51   COMPUTE_LOWER_TRIANGULAR:
52     The element matrices we assemble are lower-triangular so it is not necessary to compute
53     all entries explicitly.  If this macro is defined, the lower-triangular entries are
54     computed explicitly.
55 
56 */
57 
58 #if defined(PETSC_APPLE_FRAMEWORK)
59 #import <PETSc/petscsnes.h>
60 #import <PETSc/petsc/private/dmdaimpl.h>     /* There is not yet a public interface to manipulate dm->ops */
61 #else
62 
63 #include <petscsnes.h>
64 #include <petsc/private/dmdaimpl.h>     /* There is not yet a public interface to manipulate dm->ops */
65 #endif
66 #include <ctype.h>              /* toupper() */
67 
68 #if defined(__cplusplus) || defined (PETSC_HAVE_WINDOWS_COMPILERS) || defined (__PGI)
69 /*  c++ cannot handle  [_restrict_] notation like C does */
70 #undef PETSC_RESTRICT
71 #define PETSC_RESTRICT
72 #endif
73 
74 #if defined __SSE2__
75 #  include <emmintrin.h>
76 #endif
77 
78 /* The SSE2 kernels are only for PetscScalar=double on architectures that support it */
79 #if !defined NO_SSE2                           \
80      && !defined PETSC_USE_COMPLEX             \
81      && !defined PETSC_USE_REAL_SINGLE         \
82      && !defined PETSC_USE_REAL___FLOAT128     \
83      && !defined PETSC_USE_REAL___FP16         \
84      && defined __SSE2__
85 #define USE_SSE2_KERNELS 1
86 #else
87 #define USE_SSE2_KERNELS 0
88 #endif
89 
90 static PetscClassId THI_CLASSID;
91 
92 typedef enum {QUAD_GAUSS,QUAD_LOBATTO} QuadratureType;
93 static const char      *QuadratureTypes[] = {"gauss","lobatto","QuadratureType","QUAD_",0};
94 PETSC_UNUSED static const PetscReal HexQWeights[8]     = {1,1,1,1,1,1,1,1};
95 PETSC_UNUSED static const PetscReal HexQNodes[]        = {-0.57735026918962573, 0.57735026918962573};
96 #define G 0.57735026918962573
97 #define H (0.5*(1.+G))
98 #define L (0.5*(1.-G))
99 #define M (-0.5)
100 #define P (0.5)
101 /* Special quadrature: Lobatto in horizontal, Gauss in vertical */
102 static const PetscReal HexQInterp_Lobatto[8][8] = {{H,0,0,0,L,0,0,0},
103                                                    {0,H,0,0,0,L,0,0},
104                                                    {0,0,H,0,0,0,L,0},
105                                                    {0,0,0,H,0,0,0,L},
106                                                    {L,0,0,0,H,0,0,0},
107                                                    {0,L,0,0,0,H,0,0},
108                                                    {0,0,L,0,0,0,H,0},
109                                                    {0,0,0,L,0,0,0,H}};
110 static const PetscReal HexQDeriv_Lobatto[8][8][3] = {
111   {{M*H,M*H,M},{P*H,0,0}  ,{0,0,0}    ,{0,P*H,0}  ,{M*L,M*L,P},{P*L,0,0}  ,{0,0,0}    ,{0,P*L,0}  },
112   {{M*H,0,0}  ,{P*H,M*H,M},{0,P*H,0}  ,{0,0,0}    ,{M*L,0,0}  ,{P*L,M*L,P},{0,P*L,0}  ,{0,0,0}    },
113   {{0,0,0}    ,{0,M*H,0}  ,{P*H,P*H,M},{M*H,0,0}  ,{0,0,0}    ,{0,M*L,0}  ,{P*L,P*L,P},{M*L,0,0}  },
114   {{0,M*H,0}  ,{0,0,0}    ,{P*H,0,0}  ,{M*H,P*H,M},{0,M*L,0}  ,{0,0,0}    ,{P*L,0,0}  ,{M*L,P*L,P}},
115   {{M*L,M*L,M},{P*L,0,0}  ,{0,0,0}    ,{0,P*L,0}  ,{M*H,M*H,P},{P*H,0,0}  ,{0,0,0}    ,{0,P*H,0}  },
116   {{M*L,0,0}  ,{P*L,M*L,M},{0,P*L,0}  ,{0,0,0}    ,{M*H,0,0}  ,{P*H,M*H,P},{0,P*H,0}  ,{0,0,0}    },
117   {{0,0,0}    ,{0,M*L,0}  ,{P*L,P*L,M},{M*L,0,0}  ,{0,0,0}    ,{0,M*H,0}  ,{P*H,P*H,P},{M*H,0,0}  },
118   {{0,M*L,0}  ,{0,0,0}    ,{P*L,0,0}  ,{M*L,P*L,M},{0,M*H,0}  ,{0,0,0}    ,{P*H,0,0}  ,{M*H,P*H,P}}};
119 /* Stanndard Gauss */
120 static const PetscReal HexQInterp_Gauss[8][8] = {{H*H*H,L*H*H,L*L*H,H*L*H, H*H*L,L*H*L,L*L*L,H*L*L},
121                                                  {L*H*H,H*H*H,H*L*H,L*L*H, L*H*L,H*H*L,H*L*L,L*L*L},
122                                                  {L*L*H,H*L*H,H*H*H,L*H*H, L*L*L,H*L*L,H*H*L,L*H*L},
123                                                  {H*L*H,L*L*H,L*H*H,H*H*H, H*L*L,L*L*L,L*H*L,H*H*L},
124                                                  {H*H*L,L*H*L,L*L*L,H*L*L, H*H*H,L*H*H,L*L*H,H*L*H},
125                                                  {L*H*L,H*H*L,H*L*L,L*L*L, L*H*H,H*H*H,H*L*H,L*L*H},
126                                                  {L*L*L,H*L*L,H*H*L,L*H*L, L*L*H,H*L*H,H*H*H,L*H*H},
127                                                  {H*L*L,L*L*L,L*H*L,H*H*L, H*L*H,L*L*H,L*H*H,H*H*H}};
128 static const PetscReal HexQDeriv_Gauss[8][8][3] = {
129   {{M*H*H,H*M*H,H*H*M},{P*H*H,L*M*H,L*H*M},{P*L*H,L*P*H,L*L*M},{M*L*H,H*P*H,H*L*M}, {M*H*L,H*M*L,H*H*P},{P*H*L,L*M*L,L*H*P},{P*L*L,L*P*L,L*L*P},{M*L*L,H*P*L,H*L*P}},
130   {{M*H*H,L*M*H,L*H*M},{P*H*H,H*M*H,H*H*M},{P*L*H,H*P*H,H*L*M},{M*L*H,L*P*H,L*L*M}, {M*H*L,L*M*L,L*H*P},{P*H*L,H*M*L,H*H*P},{P*L*L,H*P*L,H*L*P},{M*L*L,L*P*L,L*L*P}},
131   {{M*L*H,L*M*H,L*L*M},{P*L*H,H*M*H,H*L*M},{P*H*H,H*P*H,H*H*M},{M*H*H,L*P*H,L*H*M}, {M*L*L,L*M*L,L*L*P},{P*L*L,H*M*L,H*L*P},{P*H*L,H*P*L,H*H*P},{M*H*L,L*P*L,L*H*P}},
132   {{M*L*H,H*M*H,H*L*M},{P*L*H,L*M*H,L*L*M},{P*H*H,L*P*H,L*H*M},{M*H*H,H*P*H,H*H*M}, {M*L*L,H*M*L,H*L*P},{P*L*L,L*M*L,L*L*P},{P*H*L,L*P*L,L*H*P},{M*H*L,H*P*L,H*H*P}},
133   {{M*H*L,H*M*L,H*H*M},{P*H*L,L*M*L,L*H*M},{P*L*L,L*P*L,L*L*M},{M*L*L,H*P*L,H*L*M}, {M*H*H,H*M*H,H*H*P},{P*H*H,L*M*H,L*H*P},{P*L*H,L*P*H,L*L*P},{M*L*H,H*P*H,H*L*P}},
134   {{M*H*L,L*M*L,L*H*M},{P*H*L,H*M*L,H*H*M},{P*L*L,H*P*L,H*L*M},{M*L*L,L*P*L,L*L*M}, {M*H*H,L*M*H,L*H*P},{P*H*H,H*M*H,H*H*P},{P*L*H,H*P*H,H*L*P},{M*L*H,L*P*H,L*L*P}},
135   {{M*L*L,L*M*L,L*L*M},{P*L*L,H*M*L,H*L*M},{P*H*L,H*P*L,H*H*M},{M*H*L,L*P*L,L*H*M}, {M*L*H,L*M*H,L*L*P},{P*L*H,H*M*H,H*L*P},{P*H*H,H*P*H,H*H*P},{M*H*H,L*P*H,L*H*P}},
136   {{M*L*L,H*M*L,H*L*M},{P*L*L,L*M*L,L*L*M},{P*H*L,L*P*L,L*H*M},{M*H*L,H*P*L,H*H*M}, {M*L*H,H*M*H,H*L*P},{P*L*H,L*M*H,L*L*P},{P*H*H,L*P*H,L*H*P},{M*H*H,H*P*H,H*H*P}}};
137 static const PetscReal (*HexQInterp)[8],(*HexQDeriv)[8][3];
138 /* Standard 2x2 Gauss quadrature for the bottom layer. */
139 static const PetscReal QuadQInterp[4][4] = {{H*H,L*H,L*L,H*L},
140                                             {L*H,H*H,H*L,L*L},
141                                             {L*L,H*L,H*H,L*H},
142                                             {H*L,L*L,L*H,H*H}};
143 static const PetscReal QuadQDeriv[4][4][2] = {
144   {{M*H,M*H},{P*H,M*L},{P*L,P*L},{M*L,P*H}},
145   {{M*H,M*L},{P*H,M*H},{P*L,P*H},{M*L,P*L}},
146   {{M*L,M*L},{P*L,M*H},{P*H,P*H},{M*H,P*L}},
147   {{M*L,M*H},{P*L,M*L},{P*H,P*L},{M*H,P*H}}};
148 #undef G
149 #undef H
150 #undef L
151 #undef M
152 #undef P
153 
154 #define HexExtract(x,i,j,k,n) do {              \
155     (n)[0] = (x)[i][j][k];                      \
156     (n)[1] = (x)[i+1][j][k];                    \
157     (n)[2] = (x)[i+1][j+1][k];                  \
158     (n)[3] = (x)[i][j+1][k];                    \
159     (n)[4] = (x)[i][j][k+1];                    \
160     (n)[5] = (x)[i+1][j][k+1];                  \
161     (n)[6] = (x)[i+1][j+1][k+1];                \
162     (n)[7] = (x)[i][j+1][k+1];                  \
163   } while (0)
164 
165 #define HexExtractRef(x,i,j,k,n) do {           \
166     (n)[0] = &(x)[i][j][k];                     \
167     (n)[1] = &(x)[i+1][j][k];                   \
168     (n)[2] = &(x)[i+1][j+1][k];                 \
169     (n)[3] = &(x)[i][j+1][k];                   \
170     (n)[4] = &(x)[i][j][k+1];                   \
171     (n)[5] = &(x)[i+1][j][k+1];                 \
172     (n)[6] = &(x)[i+1][j+1][k+1];               \
173     (n)[7] = &(x)[i][j+1][k+1];                 \
174   } while (0)
175 
176 #define QuadExtract(x,i,j,n) do {               \
177     (n)[0] = (x)[i][j];                         \
178     (n)[1] = (x)[i+1][j];                       \
179     (n)[2] = (x)[i+1][j+1];                     \
180     (n)[3] = (x)[i][j+1];                       \
181   } while (0)
182 
183 static void HexGrad(const PetscReal dphi[][3],const PetscReal zn[],PetscReal dz[])
184 {
185   PetscInt i;
186   dz[0] = dz[1] = dz[2] = 0;
187   for (i=0; i<8; i++) {
188     dz[0] += dphi[i][0] * zn[i];
189     dz[1] += dphi[i][1] * zn[i];
190     dz[2] += dphi[i][2] * zn[i];
191   }
192 }
193 
194 static void HexComputeGeometry(PetscInt q,PetscReal hx,PetscReal hy,const PetscReal dz[PETSC_RESTRICT],PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscReal *PETSC_RESTRICT jw)
195 {
196   const PetscReal jac[3][3]  = {{hx/2,0,0}, {0,hy/2,0}, {dz[0],dz[1],dz[2]}};
197   const PetscReal ijac[3][3] = {{1/jac[0][0],0,0}, {0,1/jac[1][1],0}, {-jac[2][0]/(jac[0][0]*jac[2][2]),-jac[2][1]/(jac[1][1]*jac[2][2]),1/jac[2][2]}};
198   const PetscReal jdet       = jac[0][0]*jac[1][1]*jac[2][2];
199   PetscInt        i;
200 
201   for (i=0; i<8; i++) {
202     const PetscReal *dphir = HexQDeriv[q][i];
203     phi[i]     = HexQInterp[q][i];
204     dphi[i][0] = dphir[0]*ijac[0][0] + dphir[1]*ijac[1][0] + dphir[2]*ijac[2][0];
205     dphi[i][1] = dphir[0]*ijac[0][1] + dphir[1]*ijac[1][1] + dphir[2]*ijac[2][1];
206     dphi[i][2] = dphir[0]*ijac[0][2] + dphir[1]*ijac[1][2] + dphir[2]*ijac[2][2];
207   }
208   *jw = 1.0 * jdet;
209 }
210 
211 typedef struct _p_THI   *THI;
212 typedef struct _n_Units *Units;
213 
214 typedef struct {
215   PetscScalar u,v;
216 } Node;
217 
218 typedef struct {
219   PetscScalar b;                /* bed */
220   PetscScalar h;                /* thickness */
221   PetscScalar beta2;            /* friction */
222 } PrmNode;
223 
224 typedef struct {
225   PetscReal min,max,cmin,cmax;
226 } PRange;
227 
228 typedef enum {THIASSEMBLY_TRIDIAGONAL,THIASSEMBLY_FULL} THIAssemblyMode;
229 
230 struct _p_THI {
231   PETSCHEADER(int);
232   void      (*initialize)(THI,PetscReal x,PetscReal y,PrmNode *p);
233   PetscInt  zlevels;
234   PetscReal Lx,Ly,Lz;           /* Model domain */
235   PetscReal alpha;              /* Bed angle */
236   Units     units;
237   PetscReal dirichlet_scale;
238   PetscReal ssa_friction_scale;
239   PRange    eta;
240   PRange    beta2;
241   struct {
242     PetscReal Bd2,eps,exponent;
243   } viscosity;
244   struct {
245     PetscReal irefgam,eps2,exponent,refvel,epsvel;
246   } friction;
247   PetscReal rhog;
248   PetscBool no_slip;
249   PetscBool tridiagonal;
250   PetscBool coarse2d;
251   PetscBool verbose;
252   MatType   mattype;
253 };
254 
255 struct _n_Units {
256   /* fundamental */
257   PetscReal meter;
258   PetscReal kilogram;
259   PetscReal second;
260   /* derived */
261   PetscReal Pascal;
262   PetscReal year;
263 };
264 
265 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo*,Node***,Mat,Mat,THI);
266 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo*,Node***,Mat,Mat,THI);
267 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo*,Node**,Mat,Mat,THI);
268 
269 static void PrmHexGetZ(const PrmNode pn[],PetscInt k,PetscInt zm,PetscReal zn[])
270 {
271   const PetscScalar zm1    = zm-1,
272                     znl[8] = {pn[0].b + pn[0].h*(PetscScalar)k/zm1,
273                               pn[1].b + pn[1].h*(PetscScalar)k/zm1,
274                               pn[2].b + pn[2].h*(PetscScalar)k/zm1,
275                               pn[3].b + pn[3].h*(PetscScalar)k/zm1,
276                               pn[0].b + pn[0].h*(PetscScalar)(k+1)/zm1,
277                               pn[1].b + pn[1].h*(PetscScalar)(k+1)/zm1,
278                               pn[2].b + pn[2].h*(PetscScalar)(k+1)/zm1,
279                               pn[3].b + pn[3].h*(PetscScalar)(k+1)/zm1};
280   PetscInt i;
281   for (i=0; i<8; i++) zn[i] = PetscRealPart(znl[i]);
282 }
283 
284 /* Tests A and C are from the ISMIP-HOM paper (Pattyn et al. 2008) */
285 static void THIInitialize_HOM_A(THI thi,PetscReal x,PetscReal y,PrmNode *p)
286 {
287   Units     units = thi->units;
288   PetscReal s     = -x*PetscSinReal(thi->alpha);
289 
290   p->b     = s - 1000*units->meter + 500*units->meter * PetscSinReal(x*2*PETSC_PI/thi->Lx) * PetscSinReal(y*2*PETSC_PI/thi->Ly);
291   p->h     = s - p->b;
292   p->beta2 = 1e30;
293 }
294 
295 static void THIInitialize_HOM_C(THI thi,PetscReal x,PetscReal y,PrmNode *p)
296 {
297   Units     units = thi->units;
298   PetscReal s     = -x*PetscSinReal(thi->alpha);
299 
300   p->b = s - 1000*units->meter;
301   p->h = s - p->b;
302   /* tau_b = beta2 v   is a stress (Pa) */
303   p->beta2 = 1000 * (1 + PetscSinReal(x*2*PETSC_PI/thi->Lx)*PetscSinReal(y*2*PETSC_PI/thi->Ly)) * units->Pascal * units->year / units->meter;
304 }
305 
306 /* These are just toys */
307 
308 /* Same bed as test A, free slip everywhere except for a discontinuous jump to a circular sticky region in the middle. */
309 static void THIInitialize_HOM_X(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
310 {
311   Units     units = thi->units;
312   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
313   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
314   p->b     = s - 1000*units->meter + 500*units->meter*PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
315   p->h     = s - p->b;
316   p->beta2 = 1000 * (r < 1 ? 2 : 0) * units->Pascal * units->year / units->meter;
317 }
318 
319 /* Like Z, but with 200 meter cliffs */
320 static void THIInitialize_HOM_Y(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
321 {
322   Units     units = thi->units;
323   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
324   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
325 
326   p->b = s - 1000*units->meter + 500*units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
327   if (PetscRealPart(p->b) > -700*units->meter) p->b += 200*units->meter;
328   p->h     = s - p->b;
329   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16*r))/PetscSqrtReal(1e-2 + 16*r)*PetscCosReal(x*3/2)*PetscCosReal(y*3/2)) * units->Pascal * units->year / units->meter;
330 }
331 
332 /* Same bed as A, smoothly varying slipperiness, similar to MATLAB's "sombrero" (uncorrelated with bathymetry) */
333 static void THIInitialize_HOM_Z(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
334 {
335   Units     units = thi->units;
336   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
337   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
338 
339   p->b     = s - 1000*units->meter + 500*units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
340   p->h     = s - p->b;
341   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16*r))/PetscSqrtReal(1e-2 + 16*r)*PetscCosReal(x*3/2)*PetscCosReal(y*3/2)) * units->Pascal * units->year / units->meter;
342 }
343 
344 static void THIFriction(THI thi,PetscReal rbeta2,PetscReal gam,PetscReal *beta2,PetscReal *dbeta2)
345 {
346   if (thi->friction.irefgam == 0) {
347     Units units = thi->units;
348     thi->friction.irefgam = 1./(0.5*PetscSqr(thi->friction.refvel * units->meter / units->year));
349     thi->friction.eps2    = 0.5*PetscSqr(thi->friction.epsvel * units->meter / units->year) * thi->friction.irefgam;
350   }
351   if (thi->friction.exponent == 0) {
352     *beta2  = rbeta2;
353     *dbeta2 = 0;
354   } else {
355     *beta2  = rbeta2 * PetscPowReal(thi->friction.eps2 + gam*thi->friction.irefgam,thi->friction.exponent);
356     *dbeta2 = thi->friction.exponent * *beta2 / (thi->friction.eps2 + gam*thi->friction.irefgam) * thi->friction.irefgam;
357   }
358 }
359 
360 static void THIViscosity(THI thi,PetscReal gam,PetscReal *eta,PetscReal *deta)
361 {
362   PetscReal Bd2,eps,exponent;
363   if (thi->viscosity.Bd2 == 0) {
364     Units units = thi->units;
365     const PetscReal
366       n = 3.,                                           /* Glen exponent */
367       p = 1. + 1./n,                                    /* for Stokes */
368       A = 1.e-16 * PetscPowReal(units->Pascal,-n) / units->year, /* softness parameter (Pa^{-n}/s) */
369       B = PetscPowReal(A,-1./n);                                 /* hardness parameter */
370     thi->viscosity.Bd2      = B/2;
371     thi->viscosity.exponent = (p-2)/2;
372     thi->viscosity.eps      = 0.5*PetscSqr(1e-5 / units->year);
373   }
374   Bd2      = thi->viscosity.Bd2;
375   exponent = thi->viscosity.exponent;
376   eps      = thi->viscosity.eps;
377   *eta     = Bd2 * PetscPowReal(eps + gam,exponent);
378   *deta    = exponent * (*eta) / (eps + gam);
379 }
380 
381 static void RangeUpdate(PetscReal *min,PetscReal *max,PetscReal x)
382 {
383   if (x < *min) *min = x;
384   if (x > *max) *max = x;
385 }
386 
387 static void PRangeClear(PRange *p)
388 {
389   p->cmin = p->min = 1e100;
390   p->cmax = p->max = -1e100;
391 }
392 
393 static PetscErrorCode PRangeMinMax(PRange *p,PetscReal min,PetscReal max)
394 {
395   PetscFunctionBeginUser;
396   p->cmin = min;
397   p->cmax = max;
398   if (min < p->min) p->min = min;
399   if (max > p->max) p->max = max;
400   PetscFunctionReturn(0);
401 }
402 
403 static PetscErrorCode THIDestroy(THI *thi)
404 {
405   PetscFunctionBeginUser;
406   if (!*thi) PetscFunctionReturn(0);
407   if (--((PetscObject)(*thi))->refct > 0) {*thi = 0; PetscFunctionReturn(0);}
408   PetscCall(PetscFree((*thi)->units));
409   PetscCall(PetscFree((*thi)->mattype));
410   PetscCall(PetscHeaderDestroy(thi));
411   PetscFunctionReturn(0);
412 }
413 
414 static PetscErrorCode THICreate(MPI_Comm comm,THI *inthi)
415 {
416   static PetscBool registered = PETSC_FALSE;
417   THI              thi;
418   Units            units;
419   PetscErrorCode   ierr;
420 
421   PetscFunctionBeginUser;
422   *inthi = 0;
423   if (!registered) {
424     PetscCall(PetscClassIdRegister("Toy Hydrostatic Ice",&THI_CLASSID));
425     registered = PETSC_TRUE;
426   }
427   PetscCall(PetscHeaderCreate(thi,THI_CLASSID,"THI","Toy Hydrostatic Ice","",comm,THIDestroy,0));
428 
429   PetscCall(PetscNew(&thi->units));
430   units           = thi->units;
431   units->meter    = 1e-2;
432   units->second   = 1e-7;
433   units->kilogram = 1e-12;
434 
435   ierr = PetscOptionsBegin(comm,NULL,"Scaled units options","");PetscCall(ierr);
436   {
437     PetscCall(PetscOptionsReal("-units_meter","1 meter in scaled length units","",units->meter,&units->meter,NULL));
438     PetscCall(PetscOptionsReal("-units_second","1 second in scaled time units","",units->second,&units->second,NULL));
439     PetscCall(PetscOptionsReal("-units_kilogram","1 kilogram in scaled mass units","",units->kilogram,&units->kilogram,NULL));
440   }
441   ierr          = PetscOptionsEnd();PetscCall(ierr);
442   units->Pascal = units->kilogram / (units->meter * PetscSqr(units->second));
443   units->year   = 31556926. * units->second; /* seconds per year */
444 
445   thi->Lx              = 10.e3;
446   thi->Ly              = 10.e3;
447   thi->Lz              = 1000;
448   thi->dirichlet_scale = 1;
449   thi->verbose         = PETSC_FALSE;
450 
451   ierr = PetscOptionsBegin(comm,NULL,"Toy Hydrostatic Ice options","");PetscCall(ierr);
452   {
453     QuadratureType quad       = QUAD_GAUSS;
454     char           homexp[]   = "A";
455     char           mtype[256] = MATSBAIJ;
456     PetscReal      L,m = 1.0;
457     PetscBool      flg;
458     L    = thi->Lx;
459     PetscCall(PetscOptionsReal("-thi_L","Domain size (m)","",L,&L,&flg));
460     if (flg) thi->Lx = thi->Ly = L;
461     PetscCall(PetscOptionsReal("-thi_Lx","X Domain size (m)","",thi->Lx,&thi->Lx,NULL));
462     PetscCall(PetscOptionsReal("-thi_Ly","Y Domain size (m)","",thi->Ly,&thi->Ly,NULL));
463     PetscCall(PetscOptionsReal("-thi_Lz","Z Domain size (m)","",thi->Lz,&thi->Lz,NULL));
464     PetscCall(PetscOptionsString("-thi_hom","ISMIP-HOM experiment (A or C)","",homexp,homexp,sizeof(homexp),NULL));
465     switch (homexp[0] = toupper(homexp[0])) {
466     case 'A':
467       thi->initialize = THIInitialize_HOM_A;
468       thi->no_slip    = PETSC_TRUE;
469       thi->alpha      = 0.5;
470       break;
471     case 'C':
472       thi->initialize = THIInitialize_HOM_C;
473       thi->no_slip    = PETSC_FALSE;
474       thi->alpha      = 0.1;
475       break;
476     case 'X':
477       thi->initialize = THIInitialize_HOM_X;
478       thi->no_slip    = PETSC_FALSE;
479       thi->alpha      = 0.3;
480       break;
481     case 'Y':
482       thi->initialize = THIInitialize_HOM_Y;
483       thi->no_slip    = PETSC_FALSE;
484       thi->alpha      = 0.5;
485       break;
486     case 'Z':
487       thi->initialize = THIInitialize_HOM_Z;
488       thi->no_slip    = PETSC_FALSE;
489       thi->alpha      = 0.5;
490       break;
491     default:
492       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"HOM experiment '%c' not implemented",homexp[0]);
493     }
494     PetscCall(PetscOptionsEnum("-thi_quadrature","Quadrature to use for 3D elements","",QuadratureTypes,(PetscEnum)quad,(PetscEnum*)&quad,NULL));
495     switch (quad) {
496     case QUAD_GAUSS:
497       HexQInterp = HexQInterp_Gauss;
498       HexQDeriv  = HexQDeriv_Gauss;
499       break;
500     case QUAD_LOBATTO:
501       HexQInterp = HexQInterp_Lobatto;
502       HexQDeriv  = HexQDeriv_Lobatto;
503       break;
504     }
505     PetscCall(PetscOptionsReal("-thi_alpha","Bed angle (degrees)","",thi->alpha,&thi->alpha,NULL));
506 
507     thi->friction.refvel = 100.;
508     thi->friction.epsvel = 1.;
509 
510     PetscCall(PetscOptionsReal("-thi_friction_refvel","Reference velocity for sliding","",thi->friction.refvel,&thi->friction.refvel,NULL));
511     PetscCall(PetscOptionsReal("-thi_friction_epsvel","Regularization velocity for sliding","",thi->friction.epsvel,&thi->friction.epsvel,NULL));
512     PetscCall(PetscOptionsReal("-thi_friction_m","Friction exponent, 0=Coulomb, 1=Navier","",m,&m,NULL));
513 
514     thi->friction.exponent = (m-1)/2;
515 
516     PetscCall(PetscOptionsReal("-thi_dirichlet_scale","Scale Dirichlet boundary conditions by this factor","",thi->dirichlet_scale,&thi->dirichlet_scale,NULL));
517     PetscCall(PetscOptionsReal("-thi_ssa_friction_scale","Scale slip boundary conditions by this factor in SSA (2D) assembly","",thi->ssa_friction_scale,&thi->ssa_friction_scale,NULL));
518     PetscCall(PetscOptionsBool("-thi_coarse2d","Use a 2D coarse space corresponding to SSA","",thi->coarse2d,&thi->coarse2d,NULL));
519     PetscCall(PetscOptionsBool("-thi_tridiagonal","Assemble a tridiagonal system (column coupling only) on the finest level","",thi->tridiagonal,&thi->tridiagonal,NULL));
520     PetscCall(PetscOptionsFList("-thi_mat_type","Matrix type","MatSetType",MatList,mtype,(char*)mtype,sizeof(mtype),NULL));
521     PetscCall(PetscStrallocpy(mtype,(char**)&thi->mattype));
522     PetscCall(PetscOptionsBool("-thi_verbose","Enable verbose output (like matrix sizes and statistics)","",thi->verbose,&thi->verbose,NULL));
523   }
524   ierr = PetscOptionsEnd();PetscCall(ierr);
525 
526   /* dimensionalize */
527   thi->Lx    *= units->meter;
528   thi->Ly    *= units->meter;
529   thi->Lz    *= units->meter;
530   thi->alpha *= PETSC_PI / 180;
531 
532   PRangeClear(&thi->eta);
533   PRangeClear(&thi->beta2);
534 
535   {
536     PetscReal u       = 1000*units->meter/(3e7*units->second),
537               gradu   = u / (100*units->meter),eta,deta,
538               rho     = 910 * units->kilogram/PetscPowReal(units->meter,3),
539               grav    = 9.81 * units->meter/PetscSqr(units->second),
540               driving = rho * grav * PetscSinReal(thi->alpha) * 1000*units->meter;
541     THIViscosity(thi,0.5*gradu*gradu,&eta,&deta);
542     thi->rhog = rho * grav;
543     if (thi->verbose) {
544       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Units: meter %8.2g  second %8.2g  kg %8.2g  Pa %8.2g\n",(double)units->meter,(double)units->second,(double)units->kilogram,(double)units->Pascal));
545       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Domain (%6.2g,%6.2g,%6.2g), pressure %8.2g, driving stress %8.2g\n",(double)thi->Lx,(double)thi->Ly,(double)thi->Lz,(double)(rho*grav*1e3*units->meter),(double)driving));
546       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Large velocity 1km/a %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n",(double)u,(double)gradu,(double)eta,(double)(2*eta*gradu),(double)(2*eta*gradu/driving)));
547       THIViscosity(thi,0.5*PetscSqr(1e-3*gradu),&eta,&deta);
548       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Small velocity 1m/a  %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n",(double)(1e-3*u),(double)(1e-3*gradu),(double)eta,(double)(2*eta*1e-3*gradu),(double)(2*eta*1e-3*gradu/driving)));
549     }
550   }
551 
552   *inthi = thi;
553   PetscFunctionReturn(0);
554 }
555 
556 static PetscErrorCode THIInitializePrm(THI thi,DM da2prm,Vec prm)
557 {
558   PrmNode        **p;
559   PetscInt       i,j,xs,xm,ys,ym,mx,my;
560 
561   PetscFunctionBeginUser;
562   PetscCall(DMDAGetGhostCorners(da2prm,&ys,&xs,0,&ym,&xm,0));
563   PetscCall(DMDAGetInfo(da2prm,0, &my,&mx,0, 0,0,0, 0,0,0,0,0,0));
564   PetscCall(DMDAVecGetArray(da2prm,prm,&p));
565   for (i=xs; i<xs+xm; i++) {
566     for (j=ys; j<ys+ym; j++) {
567       PetscReal xx = thi->Lx*i/mx,yy = thi->Ly*j/my;
568       thi->initialize(thi,xx,yy,&p[i][j]);
569     }
570   }
571   PetscCall(DMDAVecRestoreArray(da2prm,prm,&p));
572   PetscFunctionReturn(0);
573 }
574 
575 static PetscErrorCode THISetUpDM(THI thi,DM dm)
576 {
577   PetscInt        refinelevel,coarsenlevel,level,dim,Mx,My,Mz,mx,my,s;
578   DMDAStencilType st;
579   DM              da2prm;
580   Vec             X;
581 
582   PetscFunctionBeginUser;
583   PetscCall(DMDAGetInfo(dm,&dim, &Mz,&My,&Mx, 0,&my,&mx, 0,&s,0,0,0,&st));
584   if (dim == 2) {
585     PetscCall(DMDAGetInfo(dm,&dim, &My,&Mx,0, &my,&mx,0, 0,&s,0,0,0,&st));
586   }
587   PetscCall(DMGetRefineLevel(dm,&refinelevel));
588   PetscCall(DMGetCoarsenLevel(dm,&coarsenlevel));
589   level = refinelevel - coarsenlevel;
590   PetscCall(DMDACreate2d(PetscObjectComm((PetscObject)thi),DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,st,My,Mx,my,mx,sizeof(PrmNode)/sizeof(PetscScalar),s,0,0,&da2prm));
591   PetscCall(DMSetUp(da2prm));
592   PetscCall(DMCreateLocalVector(da2prm,&X));
593   {
594     PetscReal Lx = thi->Lx / thi->units->meter,Ly = thi->Ly / thi->units->meter,Lz = thi->Lz / thi->units->meter;
595     if (dim == 2) {
596       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Level %D domain size (m) %8.2g x %8.2g, num elements %D x %D (%D), size (m) %g x %g\n",level,(double)Lx,(double)Ly,Mx,My,Mx*My,(double)(Lx/Mx),(double)(Ly/My)));
597     } else {
598       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Level %D domain size (m) %8.2g x %8.2g x %8.2g, num elements %D x %D x %D (%D), size (m) %g x %g x %g\n",level,(double)Lx,(double)Ly,(double)Lz,Mx,My,Mz,Mx*My*Mz,(double)(Lx/Mx),(double)(Ly/My),(double)(1000./(Mz-1))));
599     }
600   }
601   PetscCall(THIInitializePrm(thi,da2prm,X));
602   if (thi->tridiagonal) {       /* Reset coarse Jacobian evaluation */
603     PetscCall(DMDASNESSetJacobianLocal(dm,(DMDASNESJacobian)THIJacobianLocal_3D_Full,thi));
604   }
605   if (thi->coarse2d) {
606     PetscCall(DMDASNESSetJacobianLocal(dm,(DMDASNESJacobian)THIJacobianLocal_2D,thi));
607   }
608   PetscCall(PetscObjectCompose((PetscObject)dm,"DMDA2Prm",(PetscObject)da2prm));
609   PetscCall(PetscObjectCompose((PetscObject)dm,"DMDA2Prm_Vec",(PetscObject)X));
610   PetscCall(DMDestroy(&da2prm));
611   PetscCall(VecDestroy(&X));
612   PetscFunctionReturn(0);
613 }
614 
615 static PetscErrorCode DMCoarsenHook_THI(DM dmf,DM dmc,void *ctx)
616 {
617   THI            thi = (THI)ctx;
618   PetscInt       rlevel,clevel;
619 
620   PetscFunctionBeginUser;
621   PetscCall(THISetUpDM(thi,dmc));
622   PetscCall(DMGetRefineLevel(dmc,&rlevel));
623   PetscCall(DMGetCoarsenLevel(dmc,&clevel));
624   if (rlevel-clevel == 0) PetscCall(DMSetMatType(dmc,MATAIJ));
625   PetscCall(DMCoarsenHookAdd(dmc,DMCoarsenHook_THI,NULL,thi));
626   PetscFunctionReturn(0);
627 }
628 
629 static PetscErrorCode DMRefineHook_THI(DM dmc,DM dmf,void *ctx)
630 {
631   THI            thi = (THI)ctx;
632 
633   PetscFunctionBeginUser;
634   PetscCall(THISetUpDM(thi,dmf));
635   PetscCall(DMSetMatType(dmf,thi->mattype));
636   PetscCall(DMRefineHookAdd(dmf,DMRefineHook_THI,NULL,thi));
637   /* With grid sequencing, a formerly-refined DM will later be coarsened by PCSetUp_MG */
638   PetscCall(DMCoarsenHookAdd(dmf,DMCoarsenHook_THI,NULL,thi));
639   PetscFunctionReturn(0);
640 }
641 
642 static PetscErrorCode THIDAGetPrm(DM da,PrmNode ***prm)
643 {
644   DM             da2prm;
645   Vec            X;
646 
647   PetscFunctionBeginUser;
648   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm",(PetscObject*)&da2prm));
649   PetscCheck(da2prm,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm composed with given DMDA");
650   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm_Vec",(PetscObject*)&X));
651   PetscCheck(X,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm_Vec composed with given DMDA");
652   PetscCall(DMDAVecGetArray(da2prm,X,prm));
653   PetscFunctionReturn(0);
654 }
655 
656 static PetscErrorCode THIDARestorePrm(DM da,PrmNode ***prm)
657 {
658   DM             da2prm;
659   Vec            X;
660 
661   PetscFunctionBeginUser;
662   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm",(PetscObject*)&da2prm));
663   PetscCheck(da2prm,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm composed with given DMDA");
664   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm_Vec",(PetscObject*)&X));
665   PetscCheck(X,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm_Vec composed with given DMDA");
666   PetscCall(DMDAVecRestoreArray(da2prm,X,prm));
667   PetscFunctionReturn(0);
668 }
669 
670 static PetscErrorCode THIInitial(SNES snes,Vec X,void *ctx)
671 {
672   THI            thi;
673   PetscInt       i,j,k,xs,xm,ys,ym,zs,zm,mx,my;
674   PetscReal      hx,hy;
675   PrmNode        **prm;
676   Node           ***x;
677   DM             da;
678 
679   PetscFunctionBeginUser;
680   PetscCall(SNESGetDM(snes,&da));
681   PetscCall(DMGetApplicationContext(da,&thi));
682   PetscCall(DMDAGetInfo(da,0, 0,&my,&mx, 0,0,0, 0,0,0,0,0,0));
683   PetscCall(DMDAGetCorners(da,&zs,&ys,&xs,&zm,&ym,&xm));
684   PetscCall(DMDAVecGetArray(da,X,&x));
685   PetscCall(THIDAGetPrm(da,&prm));
686   hx   = thi->Lx / mx;
687   hy   = thi->Ly / my;
688   for (i=xs; i<xs+xm; i++) {
689     for (j=ys; j<ys+ym; j++) {
690       for (k=zs; k<zs+zm; k++) {
691         const PetscScalar zm1      = zm-1,
692                           drivingx = thi->rhog * (prm[i+1][j].b+prm[i+1][j].h - prm[i-1][j].b-prm[i-1][j].h) / (2*hx),
693                           drivingy = thi->rhog * (prm[i][j+1].b+prm[i][j+1].h - prm[i][j-1].b-prm[i][j-1].h) / (2*hy);
694         x[i][j][k].u = 0. * drivingx * prm[i][j].h*(PetscScalar)k/zm1;
695         x[i][j][k].v = 0. * drivingy * prm[i][j].h*(PetscScalar)k/zm1;
696       }
697     }
698   }
699   PetscCall(DMDAVecRestoreArray(da,X,&x));
700   PetscCall(THIDARestorePrm(da,&prm));
701   PetscFunctionReturn(0);
702 }
703 
704 static void PointwiseNonlinearity(THI thi,const Node n[PETSC_RESTRICT],const PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscScalar *PETSC_RESTRICT u,PetscScalar *PETSC_RESTRICT v,PetscScalar du[PETSC_RESTRICT],PetscScalar dv[PETSC_RESTRICT],PetscReal *eta,PetscReal *deta)
705 {
706   PetscInt    l,ll;
707   PetscScalar gam;
708 
709   du[0] = du[1] = du[2] = 0;
710   dv[0] = dv[1] = dv[2] = 0;
711   *u    = 0;
712   *v    = 0;
713   for (l=0; l<8; l++) {
714     *u += phi[l] * n[l].u;
715     *v += phi[l] * n[l].v;
716     for (ll=0; ll<3; ll++) {
717       du[ll] += dphi[l][ll] * n[l].u;
718       dv[ll] += dphi[l][ll] * n[l].v;
719     }
720   }
721   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0]*dv[1] + 0.25*PetscSqr(du[1]+dv[0]) + 0.25*PetscSqr(du[2]) + 0.25*PetscSqr(dv[2]);
722   THIViscosity(thi,PetscRealPart(gam),eta,deta);
723 }
724 
725 static void PointwiseNonlinearity2D(THI thi,Node n[],PetscReal phi[],PetscReal dphi[4][2],PetscScalar *u,PetscScalar *v,PetscScalar du[],PetscScalar dv[],PetscReal *eta,PetscReal *deta)
726 {
727   PetscInt    l,ll;
728   PetscScalar gam;
729 
730   du[0] = du[1] = 0;
731   dv[0] = dv[1] = 0;
732   *u    = 0;
733   *v    = 0;
734   for (l=0; l<4; l++) {
735     *u += phi[l] * n[l].u;
736     *v += phi[l] * n[l].v;
737     for (ll=0; ll<2; ll++) {
738       du[ll] += dphi[l][ll] * n[l].u;
739       dv[ll] += dphi[l][ll] * n[l].v;
740     }
741   }
742   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0]*dv[1] + 0.25*PetscSqr(du[1]+dv[0]);
743   THIViscosity(thi,PetscRealPart(gam),eta,deta);
744 }
745 
746 static PetscErrorCode THIFunctionLocal(DMDALocalInfo *info,Node ***x,Node ***f,THI thi)
747 {
748   PetscInt       xs,ys,xm,ym,zm,i,j,k,q,l;
749   PetscReal      hx,hy,etamin,etamax,beta2min,beta2max;
750   PrmNode        **prm;
751 
752   PetscFunctionBeginUser;
753   xs = info->zs;
754   ys = info->ys;
755   xm = info->zm;
756   ym = info->ym;
757   zm = info->xm;
758   hx = thi->Lx / info->mz;
759   hy = thi->Ly / info->my;
760 
761   etamin   = 1e100;
762   etamax   = 0;
763   beta2min = 1e100;
764   beta2max = 0;
765 
766   PetscCall(THIDAGetPrm(info->da,&prm));
767 
768   for (i=xs; i<xs+xm; i++) {
769     for (j=ys; j<ys+ym; j++) {
770       PrmNode pn[4];
771       QuadExtract(prm,i,j,pn);
772       for (k=0; k<zm-1; k++) {
773         PetscInt  ls = 0;
774         Node      n[8],*fn[8];
775         PetscReal zn[8],etabase = 0;
776         PrmHexGetZ(pn,k,zm,zn);
777         HexExtract(x,i,j,k,n);
778         HexExtractRef(f,i,j,k,fn);
779         if (thi->no_slip && k == 0) {
780           for (l=0; l<4; l++) n[l].u = n[l].v = 0;
781           /* The first 4 basis functions lie on the bottom layer, so their contribution is exactly 0, hence we can skip them */
782           ls = 4;
783         }
784         for (q=0; q<8; q++) {
785           PetscReal   dz[3],phi[8],dphi[8][3],jw,eta,deta;
786           PetscScalar du[3],dv[3],u,v;
787           HexGrad(HexQDeriv[q],zn,dz);
788           HexComputeGeometry(q,hx,hy,dz,phi,dphi,&jw);
789           PointwiseNonlinearity(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
790           jw /= thi->rhog;      /* scales residuals to be O(1) */
791           if (q == 0) etabase = eta;
792           RangeUpdate(&etamin,&etamax,eta);
793           for (l=ls; l<8; l++) { /* test functions */
794             const PetscReal ds[2] = {-PetscSinReal(thi->alpha),0};
795             const PetscReal pp    = phi[l],*dp = dphi[l];
796             fn[l]->u += dp[0]*jw*eta*(4.*du[0]+2.*dv[1]) + dp[1]*jw*eta*(du[1]+dv[0]) + dp[2]*jw*eta*du[2] + pp*jw*thi->rhog*ds[0];
797             fn[l]->v += dp[1]*jw*eta*(2.*du[0]+4.*dv[1]) + dp[0]*jw*eta*(du[1]+dv[0]) + dp[2]*jw*eta*dv[2] + pp*jw*thi->rhog*ds[1];
798           }
799         }
800         if (k == 0) { /* we are on a bottom face */
801           if (thi->no_slip) {
802             /* Note: Non-Galerkin coarse grid operators are very sensitive to the scaling of Dirichlet boundary
803             * conditions.  After shenanigans above, etabase contains the effective viscosity at the closest quadrature
804             * point to the bed.  We want the diagonal entry in the Dirichlet condition to have similar magnitude to the
805             * diagonal entry corresponding to the adjacent node.  The fundamental scaling of the viscous part is in
806             * diagu, diagv below.  This scaling is easy to recognize by considering the finite difference operator after
807             * scaling by element size.  The no-slip Dirichlet condition is scaled by this factor, and also in the
808             * assembled matrix (see the similar block in THIJacobianLocal).
809             *
810             * Note that the residual at this Dirichlet node is linear in the state at this node, but also depends
811             * (nonlinearly in general) on the neighboring interior nodes through the local viscosity.  This will make
812             * a matrix-free Jacobian have extra entries in the corresponding row.  We assemble only the diagonal part,
813             * so the solution will exactly satisfy the boundary condition after the first linear iteration.
814             */
815             const PetscReal   hz    = PetscRealPart(pn[0].h)/(zm-1.);
816             const PetscScalar diagu = 2*etabase/thi->rhog*(hx*hy/hz + hx*hz/hy + 4*hy*hz/hx),diagv = 2*etabase/thi->rhog*(hx*hy/hz + 4*hx*hz/hy + hy*hz/hx);
817             fn[0]->u = thi->dirichlet_scale*diagu*x[i][j][k].u;
818             fn[0]->v = thi->dirichlet_scale*diagv*x[i][j][k].v;
819           } else {              /* Integrate over bottom face to apply boundary condition */
820             for (q=0; q<4; q++) {
821               const PetscReal jw = 0.25*hx*hy/thi->rhog,*phi = QuadQInterp[q];
822               PetscScalar     u  =0,v=0,rbeta2=0;
823               PetscReal       beta2,dbeta2;
824               for (l=0; l<4; l++) {
825                 u      += phi[l]*n[l].u;
826                 v      += phi[l]*n[l].v;
827                 rbeta2 += phi[l]*pn[l].beta2;
828               }
829               THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
830               RangeUpdate(&beta2min,&beta2max,beta2);
831               for (l=0; l<4; l++) {
832                 const PetscReal pp = phi[l];
833                 fn[ls+l]->u += pp*jw*beta2*u;
834                 fn[ls+l]->v += pp*jw*beta2*v;
835               }
836             }
837           }
838         }
839       }
840     }
841   }
842 
843   PetscCall(THIDARestorePrm(info->da,&prm));
844 
845   PetscCall(PRangeMinMax(&thi->eta,etamin,etamax));
846   PetscCall(PRangeMinMax(&thi->beta2,beta2min,beta2max));
847   PetscFunctionReturn(0);
848 }
849 
850 static PetscErrorCode THIMatrixStatistics(THI thi,Mat B,PetscViewer viewer)
851 {
852   PetscReal      nrm;
853   PetscInt       m;
854   PetscMPIInt    rank;
855 
856   PetscFunctionBeginUser;
857   PetscCall(MatNorm(B,NORM_FROBENIUS,&nrm));
858   PetscCall(MatGetSize(B,&m,0));
859   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B),&rank));
860   if (rank == 0) {
861     PetscScalar val0,val2;
862     PetscCall(MatGetValue(B,0,0,&val0));
863     PetscCall(MatGetValue(B,2,2,&val2));
864     PetscCall(PetscViewerASCIIPrintf(viewer,"Matrix dim %D norm %8.2e (0,0) %8.2e  (2,2) %8.2e %8.2e <= eta <= %8.2e %8.2e <= beta2 <= %8.2e\n",m,(double)nrm,(double)PetscRealPart(val0),(double)PetscRealPart(val2),(double)thi->eta.cmin,(double)thi->eta.cmax,(double)thi->beta2.cmin,(double)thi->beta2.cmax));
865   }
866   PetscFunctionReturn(0);
867 }
868 
869 static PetscErrorCode THISurfaceStatistics(DM da,Vec X,PetscReal *min,PetscReal *max,PetscReal *mean)
870 {
871   Node           ***x;
872   PetscInt       i,j,xs,ys,zs,xm,ym,zm,mx,my,mz;
873   PetscReal      umin = 1e100,umax=-1e100;
874   PetscScalar    usum = 0.0,gusum;
875 
876   PetscFunctionBeginUser;
877   *min = *max = *mean = 0;
878   PetscCall(DMDAGetInfo(da,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
879   PetscCall(DMDAGetCorners(da,&zs,&ys,&xs,&zm,&ym,&xm));
880   PetscCheck(zs == 0 && zm == mz,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected decomposition");
881   PetscCall(DMDAVecGetArray(da,X,&x));
882   for (i=xs; i<xs+xm; i++) {
883     for (j=ys; j<ys+ym; j++) {
884       PetscReal u = PetscRealPart(x[i][j][zm-1].u);
885       RangeUpdate(&umin,&umax,u);
886       usum += u;
887     }
888   }
889   PetscCall(DMDAVecRestoreArray(da,X,&x));
890   PetscCallMPI(MPI_Allreduce(&umin,min,1,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)da)));
891   PetscCallMPI(MPI_Allreduce(&umax,max,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)da)));
892   PetscCallMPI(MPI_Allreduce(&usum,&gusum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)da)));
893   *mean = PetscRealPart(gusum) / (mx*my);
894   PetscFunctionReturn(0);
895 }
896 
897 static PetscErrorCode THISolveStatistics(THI thi,SNES snes,PetscInt coarsened,const char name[])
898 {
899   MPI_Comm       comm;
900   Vec            X;
901   DM             dm;
902 
903   PetscFunctionBeginUser;
904   PetscCall(PetscObjectGetComm((PetscObject)thi,&comm));
905   PetscCall(SNESGetSolution(snes,&X));
906   PetscCall(SNESGetDM(snes,&dm));
907   PetscCall(PetscPrintf(comm,"Solution statistics after solve: %s\n",name));
908   {
909     PetscInt            its,lits;
910     SNESConvergedReason reason;
911     PetscCall(SNESGetIterationNumber(snes,&its));
912     PetscCall(SNESGetConvergedReason(snes,&reason));
913     PetscCall(SNESGetLinearSolveIterations(snes,&lits));
914     PetscCall(PetscPrintf(comm,"%s: Number of SNES iterations = %D, total linear iterations = %D\n",SNESConvergedReasons[reason],its,lits));
915   }
916   {
917     PetscReal         nrm2,tmin[3]={1e100,1e100,1e100},tmax[3]={-1e100,-1e100,-1e100},min[3],max[3];
918     PetscInt          i,j,m;
919     const PetscScalar *x;
920     PetscCall(VecNorm(X,NORM_2,&nrm2));
921     PetscCall(VecGetLocalSize(X,&m));
922     PetscCall(VecGetArrayRead(X,&x));
923     for (i=0; i<m; i+=2) {
924       PetscReal u = PetscRealPart(x[i]),v = PetscRealPart(x[i+1]),c = PetscSqrtReal(u*u+v*v);
925       tmin[0] = PetscMin(u,tmin[0]);
926       tmin[1] = PetscMin(v,tmin[1]);
927       tmin[2] = PetscMin(c,tmin[2]);
928       tmax[0] = PetscMax(u,tmax[0]);
929       tmax[1] = PetscMax(v,tmax[1]);
930       tmax[2] = PetscMax(c,tmax[2]);
931     }
932     PetscCall(VecRestoreArrayRead(X,&x));
933     PetscCallMPI(MPI_Allreduce(tmin,min,3,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)thi)));
934     PetscCallMPI(MPI_Allreduce(tmax,max,3,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)thi)));
935     /* Dimensionalize to meters/year */
936     nrm2 *= thi->units->year / thi->units->meter;
937     for (j=0; j<3; j++) {
938       min[j] *= thi->units->year / thi->units->meter;
939       max[j] *= thi->units->year / thi->units->meter;
940     }
941     if (min[0] == 0.0) min[0] = 0.0;
942     PetscCall(PetscPrintf(comm,"|X|_2 %g   %g <= u <=  %g   %g <= v <=  %g   %g <= c <=  %g \n",(double)nrm2,(double)min[0],(double)max[0],(double)min[1],(double)max[1],(double)min[2],(double)max[2]));
943     {
944       PetscReal umin,umax,umean;
945       PetscCall(THISurfaceStatistics(dm,X,&umin,&umax,&umean));
946       umin  *= thi->units->year / thi->units->meter;
947       umax  *= thi->units->year / thi->units->meter;
948       umean *= thi->units->year / thi->units->meter;
949       PetscCall(PetscPrintf(comm,"Surface statistics: u in [%12.6e, %12.6e] mean %12.6e\n",(double)umin,(double)umax,(double)umean));
950     }
951     /* These values stay nondimensional */
952     PetscCall(PetscPrintf(comm,"Global eta range   %g to %g converged range %g to %g\n",(double)thi->eta.min,(double)thi->eta.max,(double)thi->eta.cmin,(double)thi->eta.cmax));
953     PetscCall(PetscPrintf(comm,"Global beta2 range %g to %g converged range %g to %g\n",(double)thi->beta2.min,(double)thi->beta2.max,(double)thi->beta2.cmin,(double)thi->beta2.cmax));
954   }
955   PetscFunctionReturn(0);
956 }
957 
958 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo *info,Node **x,Mat J,Mat B,THI thi)
959 {
960   PetscInt       xs,ys,xm,ym,i,j,q,l,ll;
961   PetscReal      hx,hy;
962   PrmNode        **prm;
963 
964   PetscFunctionBeginUser;
965   xs = info->ys;
966   ys = info->xs;
967   xm = info->ym;
968   ym = info->xm;
969   hx = thi->Lx / info->my;
970   hy = thi->Ly / info->mx;
971 
972   PetscCall(MatZeroEntries(B));
973   PetscCall(THIDAGetPrm(info->da,&prm));
974 
975   for (i=xs; i<xs+xm; i++) {
976     for (j=ys; j<ys+ym; j++) {
977       Node        n[4];
978       PrmNode     pn[4];
979       PetscScalar Ke[4*2][4*2];
980       QuadExtract(prm,i,j,pn);
981       QuadExtract(x,i,j,n);
982       PetscCall(PetscMemzero(Ke,sizeof(Ke)));
983       for (q=0; q<4; q++) {
984         PetscReal   phi[4],dphi[4][2],jw,eta,deta,beta2,dbeta2;
985         PetscScalar u,v,du[2],dv[2],h = 0,rbeta2 = 0;
986         for (l=0; l<4; l++) {
987           phi[l]     = QuadQInterp[q][l];
988           dphi[l][0] = QuadQDeriv[q][l][0]*2./hx;
989           dphi[l][1] = QuadQDeriv[q][l][1]*2./hy;
990           h         += phi[l] * pn[l].h;
991           rbeta2    += phi[l] * pn[l].beta2;
992         }
993         jw = 0.25*hx*hy / thi->rhog; /* rhog is only scaling */
994         PointwiseNonlinearity2D(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
995         THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
996         for (l=0; l<4; l++) {
997           const PetscReal pp = phi[l],*dp = dphi[l];
998           for (ll=0; ll<4; ll++) {
999             const PetscReal ppl = phi[ll],*dpl = dphi[ll];
1000             PetscScalar     dgdu,dgdv;
1001             dgdu = 2.*du[0]*dpl[0] + dv[1]*dpl[0] + 0.5*(du[1]+dv[0])*dpl[1];
1002             dgdv = 2.*dv[1]*dpl[1] + du[0]*dpl[1] + 0.5*(du[1]+dv[0])*dpl[0];
1003             /* Picard part */
1004             Ke[l*2+0][ll*2+0] += dp[0]*jw*eta*4.*dpl[0] + dp[1]*jw*eta*dpl[1] + pp*jw*(beta2/h)*ppl*thi->ssa_friction_scale;
1005             Ke[l*2+0][ll*2+1] += dp[0]*jw*eta*2.*dpl[1] + dp[1]*jw*eta*dpl[0];
1006             Ke[l*2+1][ll*2+0] += dp[1]*jw*eta*2.*dpl[0] + dp[0]*jw*eta*dpl[1];
1007             Ke[l*2+1][ll*2+1] += dp[1]*jw*eta*4.*dpl[1] + dp[0]*jw*eta*dpl[0] + pp*jw*(beta2/h)*ppl*thi->ssa_friction_scale;
1008             /* extra Newton terms */
1009             Ke[l*2+0][ll*2+0] += dp[0]*jw*deta*dgdu*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdu*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*u*u*ppl*thi->ssa_friction_scale;
1010             Ke[l*2+0][ll*2+1] += dp[0]*jw*deta*dgdv*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdv*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*u*v*ppl*thi->ssa_friction_scale;
1011             Ke[l*2+1][ll*2+0] += dp[1]*jw*deta*dgdu*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdu*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*v*u*ppl*thi->ssa_friction_scale;
1012             Ke[l*2+1][ll*2+1] += dp[1]*jw*deta*dgdv*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdv*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*v*v*ppl*thi->ssa_friction_scale;
1013           }
1014         }
1015       }
1016       {
1017         const MatStencil rc[4] = {{0,i,j,0},{0,i+1,j,0},{0,i+1,j+1,0},{0,i,j+1,0}};
1018         PetscCall(MatSetValuesBlockedStencil(B,4,rc,4,rc,&Ke[0][0],ADD_VALUES));
1019       }
1020     }
1021   }
1022   PetscCall(THIDARestorePrm(info->da,&prm));
1023 
1024   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1025   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1026   PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
1027   if (thi->verbose) PetscCall(THIMatrixStatistics(thi,B,PETSC_VIEWER_STDOUT_WORLD));
1028   PetscFunctionReturn(0);
1029 }
1030 
1031 static PetscErrorCode THIJacobianLocal_3D(DMDALocalInfo *info,Node ***x,Mat B,THI thi,THIAssemblyMode amode)
1032 {
1033   PetscInt       xs,ys,xm,ym,zm,i,j,k,q,l,ll;
1034   PetscReal      hx,hy;
1035   PrmNode        **prm;
1036 
1037   PetscFunctionBeginUser;
1038   xs = info->zs;
1039   ys = info->ys;
1040   xm = info->zm;
1041   ym = info->ym;
1042   zm = info->xm;
1043   hx = thi->Lx / info->mz;
1044   hy = thi->Ly / info->my;
1045 
1046   PetscCall(MatZeroEntries(B));
1047   PetscCall(MatSetOption(B,MAT_SUBSET_OFF_PROC_ENTRIES,PETSC_TRUE));
1048   PetscCall(THIDAGetPrm(info->da,&prm));
1049 
1050   for (i=xs; i<xs+xm; i++) {
1051     for (j=ys; j<ys+ym; j++) {
1052       PrmNode pn[4];
1053       QuadExtract(prm,i,j,pn);
1054       for (k=0; k<zm-1; k++) {
1055         Node        n[8];
1056         PetscReal   zn[8],etabase = 0;
1057         PetscScalar Ke[8*2][8*2];
1058         PetscInt    ls = 0;
1059 
1060         PrmHexGetZ(pn,k,zm,zn);
1061         HexExtract(x,i,j,k,n);
1062         PetscCall(PetscMemzero(Ke,sizeof(Ke)));
1063         if (thi->no_slip && k == 0) {
1064           for (l=0; l<4; l++) n[l].u = n[l].v = 0;
1065           ls = 4;
1066         }
1067         for (q=0; q<8; q++) {
1068           PetscReal   dz[3],phi[8],dphi[8][3],jw,eta,deta;
1069           PetscScalar du[3],dv[3],u,v;
1070           HexGrad(HexQDeriv[q],zn,dz);
1071           HexComputeGeometry(q,hx,hy,dz,phi,dphi,&jw);
1072           PointwiseNonlinearity(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
1073           jw /= thi->rhog;      /* residuals are scaled by this factor */
1074           if (q == 0) etabase = eta;
1075           for (l=ls; l<8; l++) { /* test functions */
1076             const PetscReal *PETSC_RESTRICT dp = dphi[l];
1077 #if USE_SSE2_KERNELS
1078             /* gcc (up to my 4.5 snapshot) is really bad at hoisting intrinsics so we do it manually */
1079             __m128d
1080               p4         = _mm_set1_pd(4),p2 = _mm_set1_pd(2),p05 = _mm_set1_pd(0.5),
1081               p42        = _mm_setr_pd(4,2),p24 = _mm_shuffle_pd(p42,p42,_MM_SHUFFLE2(0,1)),
1082               du0        = _mm_set1_pd(du[0]),du1 = _mm_set1_pd(du[1]),du2 = _mm_set1_pd(du[2]),
1083               dv0        = _mm_set1_pd(dv[0]),dv1 = _mm_set1_pd(dv[1]),dv2 = _mm_set1_pd(dv[2]),
1084               jweta      = _mm_set1_pd(jw*eta),jwdeta = _mm_set1_pd(jw*deta),
1085               dp0        = _mm_set1_pd(dp[0]),dp1 = _mm_set1_pd(dp[1]),dp2 = _mm_set1_pd(dp[2]),
1086               dp0jweta   = _mm_mul_pd(dp0,jweta),dp1jweta = _mm_mul_pd(dp1,jweta),dp2jweta = _mm_mul_pd(dp2,jweta),
1087               p4du0p2dv1 = _mm_add_pd(_mm_mul_pd(p4,du0),_mm_mul_pd(p2,dv1)), /* 4 du0 + 2 dv1 */
1088               p4dv1p2du0 = _mm_add_pd(_mm_mul_pd(p4,dv1),_mm_mul_pd(p2,du0)), /* 4 dv1 + 2 du0 */
1089               pdu2dv2    = _mm_unpacklo_pd(du2,dv2),                          /* [du2, dv2] */
1090               du1pdv0    = _mm_add_pd(du1,dv0),                               /* du1 + dv0 */
1091               t1         = _mm_mul_pd(dp0,p4du0p2dv1),                        /* dp0 (4 du0 + 2 dv1) */
1092               t2         = _mm_mul_pd(dp1,p4dv1p2du0);                        /* dp1 (4 dv1 + 2 du0) */
1093 
1094 #endif
1095 #if defined COMPUTE_LOWER_TRIANGULAR  /* The element matrices are always symmetric so computing the lower-triangular part is not necessary */
1096             for (ll=ls; ll<8; ll++) { /* trial functions */
1097 #else
1098             for (ll=l; ll<8; ll++) {
1099 #endif
1100               const PetscReal *PETSC_RESTRICT dpl = dphi[ll];
1101               if (amode == THIASSEMBLY_TRIDIAGONAL && (l-ll)%4) continue; /* these entries would not be inserted */
1102 #if !USE_SSE2_KERNELS
1103               /* The analytic Jacobian in nice, easy-to-read form */
1104               {
1105                 PetscScalar dgdu,dgdv;
1106                 dgdu = 2.*du[0]*dpl[0] + dv[1]*dpl[0] + 0.5*(du[1]+dv[0])*dpl[1] + 0.5*du[2]*dpl[2];
1107                 dgdv = 2.*dv[1]*dpl[1] + du[0]*dpl[1] + 0.5*(du[1]+dv[0])*dpl[0] + 0.5*dv[2]*dpl[2];
1108                 /* Picard part */
1109                 Ke[l*2+0][ll*2+0] += dp[0]*jw*eta*4.*dpl[0] + dp[1]*jw*eta*dpl[1] + dp[2]*jw*eta*dpl[2];
1110                 Ke[l*2+0][ll*2+1] += dp[0]*jw*eta*2.*dpl[1] + dp[1]*jw*eta*dpl[0];
1111                 Ke[l*2+1][ll*2+0] += dp[1]*jw*eta*2.*dpl[0] + dp[0]*jw*eta*dpl[1];
1112                 Ke[l*2+1][ll*2+1] += dp[1]*jw*eta*4.*dpl[1] + dp[0]*jw*eta*dpl[0] + dp[2]*jw*eta*dpl[2];
1113                 /* extra Newton terms */
1114                 Ke[l*2+0][ll*2+0] += dp[0]*jw*deta*dgdu*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdu*(du[1]+dv[0]) + dp[2]*jw*deta*dgdu*du[2];
1115                 Ke[l*2+0][ll*2+1] += dp[0]*jw*deta*dgdv*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdv*(du[1]+dv[0]) + dp[2]*jw*deta*dgdv*du[2];
1116                 Ke[l*2+1][ll*2+0] += dp[1]*jw*deta*dgdu*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdu*(du[1]+dv[0]) + dp[2]*jw*deta*dgdu*dv[2];
1117                 Ke[l*2+1][ll*2+1] += dp[1]*jw*deta*dgdv*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdv*(du[1]+dv[0]) + dp[2]*jw*deta*dgdv*dv[2];
1118               }
1119 #else
1120               /* This SSE2 code is an exact replica of above, but uses explicit packed instructions for some speed
1121               * benefit.  On my hardware, these intrinsics are almost twice as fast as above, reducing total assembly cost
1122               * by 25 to 30 percent. */
1123               {
1124                 __m128d
1125                   keu   = _mm_loadu_pd(&Ke[l*2+0][ll*2+0]),
1126                   kev   = _mm_loadu_pd(&Ke[l*2+1][ll*2+0]),
1127                   dpl01 = _mm_loadu_pd(&dpl[0]),dpl10 = _mm_shuffle_pd(dpl01,dpl01,_MM_SHUFFLE2(0,1)),dpl2 = _mm_set_sd(dpl[2]),
1128                   t0,t3,pdgduv;
1129                 keu = _mm_add_pd(keu,_mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp0jweta,p42),dpl01),
1130                                                 _mm_add_pd(_mm_mul_pd(dp1jweta,dpl10),
1131                                                            _mm_mul_pd(dp2jweta,dpl2))));
1132                 kev = _mm_add_pd(kev,_mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp1jweta,p24),dpl01),
1133                                                 _mm_add_pd(_mm_mul_pd(dp0jweta,dpl10),
1134                                                            _mm_mul_pd(dp2jweta,_mm_shuffle_pd(dpl2,dpl2,_MM_SHUFFLE2(0,1))))));
1135                 pdgduv = _mm_mul_pd(p05,_mm_add_pd(_mm_add_pd(_mm_mul_pd(p42,_mm_mul_pd(du0,dpl01)),
1136                                                               _mm_mul_pd(p24,_mm_mul_pd(dv1,dpl01))),
1137                                                    _mm_add_pd(_mm_mul_pd(du1pdv0,dpl10),
1138                                                               _mm_mul_pd(pdu2dv2,_mm_set1_pd(dpl[2]))))); /* [dgdu, dgdv] */
1139                 t0 = _mm_mul_pd(jwdeta,pdgduv);  /* jw deta [dgdu, dgdv] */
1140                 t3 = _mm_mul_pd(t0,du1pdv0);     /* t0 (du1 + dv0) */
1141                 _mm_storeu_pd(&Ke[l*2+0][ll*2+0],_mm_add_pd(keu,_mm_add_pd(_mm_mul_pd(t1,t0),
1142                                                                            _mm_add_pd(_mm_mul_pd(dp1,t3),
1143                                                                                       _mm_mul_pd(t0,_mm_mul_pd(dp2,du2))))));
1144                 _mm_storeu_pd(&Ke[l*2+1][ll*2+0],_mm_add_pd(kev,_mm_add_pd(_mm_mul_pd(t2,t0),
1145                                                                            _mm_add_pd(_mm_mul_pd(dp0,t3),
1146                                                                                       _mm_mul_pd(t0,_mm_mul_pd(dp2,dv2))))));
1147               }
1148 #endif
1149             }
1150           }
1151         }
1152         if (k == 0) { /* on a bottom face */
1153           if (thi->no_slip) {
1154             const PetscReal   hz    = PetscRealPart(pn[0].h)/(zm-1);
1155             const PetscScalar diagu = 2*etabase/thi->rhog*(hx*hy/hz + hx*hz/hy + 4*hy*hz/hx),diagv = 2*etabase/thi->rhog*(hx*hy/hz + 4*hx*hz/hy + hy*hz/hx);
1156             Ke[0][0] = thi->dirichlet_scale*diagu;
1157             Ke[1][1] = thi->dirichlet_scale*diagv;
1158           } else {
1159             for (q=0; q<4; q++) {
1160               const PetscReal jw = 0.25*hx*hy/thi->rhog,*phi = QuadQInterp[q];
1161               PetscScalar     u  =0,v=0,rbeta2=0;
1162               PetscReal       beta2,dbeta2;
1163               for (l=0; l<4; l++) {
1164                 u      += phi[l]*n[l].u;
1165                 v      += phi[l]*n[l].v;
1166                 rbeta2 += phi[l]*pn[l].beta2;
1167               }
1168               THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
1169               for (l=0; l<4; l++) {
1170                 const PetscReal pp = phi[l];
1171                 for (ll=0; ll<4; ll++) {
1172                   const PetscReal ppl = phi[ll];
1173                   Ke[l*2+0][ll*2+0] += pp*jw*beta2*ppl + pp*jw*dbeta2*u*u*ppl;
1174                   Ke[l*2+0][ll*2+1] +=                   pp*jw*dbeta2*u*v*ppl;
1175                   Ke[l*2+1][ll*2+0] +=                   pp*jw*dbeta2*v*u*ppl;
1176                   Ke[l*2+1][ll*2+1] += pp*jw*beta2*ppl + pp*jw*dbeta2*v*v*ppl;
1177                 }
1178               }
1179             }
1180           }
1181         }
1182         {
1183           const MatStencil rc[8] = {{i,j,k,0},{i+1,j,k,0},{i+1,j+1,k,0},{i,j+1,k,0},{i,j,k+1,0},{i+1,j,k+1,0},{i+1,j+1,k+1,0},{i,j+1,k+1,0}};
1184           if (amode == THIASSEMBLY_TRIDIAGONAL) {
1185             for (l=0; l<4; l++) { /* Copy out each of the blocks, discarding horizontal coupling */
1186               const PetscInt   l4     = l+4;
1187               const MatStencil rcl[2] = {{rc[l].k,rc[l].j,rc[l].i,0},{rc[l4].k,rc[l4].j,rc[l4].i,0}};
1188 #if defined COMPUTE_LOWER_TRIANGULAR
1189               const PetscScalar Kel[4][4] = {{Ke[2*l+0][2*l+0] ,Ke[2*l+0][2*l+1] ,Ke[2*l+0][2*l4+0] ,Ke[2*l+0][2*l4+1]},
1190                                              {Ke[2*l+1][2*l+0] ,Ke[2*l+1][2*l+1] ,Ke[2*l+1][2*l4+0] ,Ke[2*l+1][2*l4+1]},
1191                                              {Ke[2*l4+0][2*l+0],Ke[2*l4+0][2*l+1],Ke[2*l4+0][2*l4+0],Ke[2*l4+0][2*l4+1]},
1192                                              {Ke[2*l4+1][2*l+0],Ke[2*l4+1][2*l+1],Ke[2*l4+1][2*l4+0],Ke[2*l4+1][2*l4+1]}};
1193 #else
1194               /* Same as above except for the lower-left block */
1195               const PetscScalar Kel[4][4] = {{Ke[2*l+0][2*l+0] ,Ke[2*l+0][2*l+1] ,Ke[2*l+0][2*l4+0] ,Ke[2*l+0][2*l4+1]},
1196                                              {Ke[2*l+1][2*l+0] ,Ke[2*l+1][2*l+1] ,Ke[2*l+1][2*l4+0] ,Ke[2*l+1][2*l4+1]},
1197                                              {Ke[2*l+0][2*l4+0],Ke[2*l+1][2*l4+0],Ke[2*l4+0][2*l4+0],Ke[2*l4+0][2*l4+1]},
1198                                              {Ke[2*l+0][2*l4+1],Ke[2*l+1][2*l4+1],Ke[2*l4+1][2*l4+0],Ke[2*l4+1][2*l4+1]}};
1199 #endif
1200               PetscCall(MatSetValuesBlockedStencil(B,2,rcl,2,rcl,&Kel[0][0],ADD_VALUES));
1201             }
1202           } else {
1203 #if !defined COMPUTE_LOWER_TRIANGULAR /* fill in lower-triangular part, this is really cheap compared to computing the entries */
1204             for (l=0; l<8; l++) {
1205               for (ll=l+1; ll<8; ll++) {
1206                 Ke[ll*2+0][l*2+0] = Ke[l*2+0][ll*2+0];
1207                 Ke[ll*2+1][l*2+0] = Ke[l*2+0][ll*2+1];
1208                 Ke[ll*2+0][l*2+1] = Ke[l*2+1][ll*2+0];
1209                 Ke[ll*2+1][l*2+1] = Ke[l*2+1][ll*2+1];
1210               }
1211             }
1212 #endif
1213             PetscCall(MatSetValuesBlockedStencil(B,8,rc,8,rc,&Ke[0][0],ADD_VALUES));
1214           }
1215         }
1216       }
1217     }
1218   }
1219   PetscCall(THIDARestorePrm(info->da,&prm));
1220 
1221   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1222   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1223   PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
1224   if (thi->verbose) PetscCall(THIMatrixStatistics(thi,B,PETSC_VIEWER_STDOUT_WORLD));
1225   PetscFunctionReturn(0);
1226 }
1227 
1228 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo *info,Node ***x,Mat A,Mat B,THI thi)
1229 {
1230   PetscFunctionBeginUser;
1231   PetscCall(THIJacobianLocal_3D(info,x,B,thi,THIASSEMBLY_FULL));
1232   PetscFunctionReturn(0);
1233 }
1234 
1235 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo *info,Node ***x,Mat A,Mat B,THI thi)
1236 {
1237   PetscFunctionBeginUser;
1238   PetscCall(THIJacobianLocal_3D(info,x,B,thi,THIASSEMBLY_TRIDIAGONAL));
1239   PetscFunctionReturn(0);
1240 }
1241 
1242 static PetscErrorCode DMRefineHierarchy_THI(DM dac0,PetscInt nlevels,DM hierarchy[])
1243 {
1244   THI             thi;
1245   PetscInt        dim,M,N,m,n,s,dof;
1246   DM              dac,daf;
1247   DMDAStencilType st;
1248   DM_DA           *ddf,*ddc;
1249 
1250   PetscFunctionBeginUser;
1251   PetscCall(PetscObjectQuery((PetscObject)dac0,"THI",(PetscObject*)&thi));
1252   PetscCheck(thi,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot refine this DMDA, missing composed THI instance");
1253   if (nlevels > 1) {
1254     PetscCall(DMRefineHierarchy(dac0,nlevels-1,hierarchy));
1255     dac  = hierarchy[nlevels-2];
1256   } else {
1257     dac = dac0;
1258   }
1259   PetscCall(DMDAGetInfo(dac,&dim, &N,&M,0, &n,&m,0, &dof,&s,0,0,0,&st));
1260   PetscCheck(dim == 2,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"This function can only refine 2D DMDAs");
1261 
1262   /* Creates a 3D DMDA with the same map-plane layout as the 2D one, with contiguous columns */
1263   PetscCall(DMDACreate3d(PetscObjectComm((PetscObject)dac),DM_BOUNDARY_NONE,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,st,thi->zlevels,N,M,1,n,m,dof,s,NULL,NULL,NULL,&daf));
1264   PetscCall(DMSetUp(daf));
1265 
1266   daf->ops->creatematrix        = dac->ops->creatematrix;
1267   daf->ops->createinterpolation = dac->ops->createinterpolation;
1268   daf->ops->getcoloring         = dac->ops->getcoloring;
1269   ddf                           = (DM_DA*)daf->data;
1270   ddc                           = (DM_DA*)dac->data;
1271   ddf->interptype               = ddc->interptype;
1272 
1273   PetscCall(DMDASetFieldName(daf,0,"x-velocity"));
1274   PetscCall(DMDASetFieldName(daf,1,"y-velocity"));
1275 
1276   hierarchy[nlevels-1] = daf;
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 static PetscErrorCode DMCreateInterpolation_DA_THI(DM dac,DM daf,Mat *A,Vec *scale)
1281 {
1282   PetscInt       dim;
1283 
1284   PetscFunctionBeginUser;
1285   PetscValidHeaderSpecific(dac,DM_CLASSID,1);
1286   PetscValidHeaderSpecific(daf,DM_CLASSID,2);
1287   PetscValidPointer(A,3);
1288   if (scale) PetscValidPointer(scale,4);
1289   PetscCall(DMDAGetInfo(daf,&dim,0,0,0,0,0,0,0,0,0,0,0,0));
1290   if (dim  == 2) {
1291     /* We are in the 2D problem and use normal DMDA interpolation */
1292     PetscCall(DMCreateInterpolation(dac,daf,A,scale));
1293   } else {
1294     PetscInt i,j,k,xs,ys,zs,xm,ym,zm,mx,my,mz,rstart,cstart;
1295     Mat      B;
1296 
1297     PetscCall(DMDAGetInfo(daf,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
1298     PetscCall(DMDAGetCorners(daf,&zs,&ys,&xs,&zm,&ym,&xm));
1299     PetscCheck(!zs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"unexpected");
1300     PetscCall(MatCreate(PetscObjectComm((PetscObject)daf),&B));
1301     PetscCall(MatSetSizes(B,xm*ym*zm,xm*ym,mx*my*mz,mx*my));
1302 
1303     PetscCall(MatSetType(B,MATAIJ));
1304     PetscCall(MatSeqAIJSetPreallocation(B,1,NULL));
1305     PetscCall(MatMPIAIJSetPreallocation(B,1,NULL,0,NULL));
1306     PetscCall(MatGetOwnershipRange(B,&rstart,NULL));
1307     PetscCall(MatGetOwnershipRangeColumn(B,&cstart,NULL));
1308     for (i=xs; i<xs+xm; i++) {
1309       for (j=ys; j<ys+ym; j++) {
1310         for (k=zs; k<zs+zm; k++) {
1311           PetscInt    i2  = i*ym+j,i3 = i2*zm+k;
1312           PetscScalar val = ((k == 0 || k == mz-1) ? 0.5 : 1.) / (mz-1.); /* Integration using trapezoid rule */
1313           PetscCall(MatSetValue(B,cstart+i3,rstart+i2,val,INSERT_VALUES));
1314         }
1315       }
1316     }
1317     PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1318     PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1319     PetscCall(MatCreateMAIJ(B,sizeof(Node)/sizeof(PetscScalar),A));
1320     PetscCall(MatDestroy(&B));
1321   }
1322   PetscFunctionReturn(0);
1323 }
1324 
1325 static PetscErrorCode DMCreateMatrix_THI_Tridiagonal(DM da,Mat *J)
1326 {
1327   Mat                    A;
1328   PetscInt               xm,ym,zm,dim,dof = 2,starts[3],dims[3];
1329   ISLocalToGlobalMapping ltog;
1330 
1331   PetscFunctionBeginUser;
1332   PetscCall(DMDAGetInfo(da,&dim, 0,0,0, 0,0,0, 0,0,0,0,0,0));
1333   PetscCheck(dim == 3,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Expected DMDA to be 3D");
1334   PetscCall(DMDAGetCorners(da,0,0,0,&zm,&ym,&xm));
1335   PetscCall(DMGetLocalToGlobalMapping(da,&ltog));
1336   PetscCall(MatCreate(PetscObjectComm((PetscObject)da),&A));
1337   PetscCall(MatSetSizes(A,dof*xm*ym*zm,dof*xm*ym*zm,PETSC_DETERMINE,PETSC_DETERMINE));
1338   PetscCall(MatSetType(A,da->mattype));
1339   PetscCall(MatSetFromOptions(A));
1340   PetscCall(MatSeqAIJSetPreallocation(A,3*2,NULL));
1341   PetscCall(MatMPIAIJSetPreallocation(A,3*2,NULL,0,NULL));
1342   PetscCall(MatSeqBAIJSetPreallocation(A,2,3,NULL));
1343   PetscCall(MatMPIBAIJSetPreallocation(A,2,3,NULL,0,NULL));
1344   PetscCall(MatSeqSBAIJSetPreallocation(A,2,2,NULL));
1345   PetscCall(MatMPISBAIJSetPreallocation(A,2,2,NULL,0,NULL));
1346   PetscCall(MatSetLocalToGlobalMapping(A,ltog,ltog));
1347   PetscCall(DMDAGetGhostCorners(da,&starts[0],&starts[1],&starts[2],&dims[0],&dims[1],&dims[2]));
1348   PetscCall(MatSetStencil(A,dim,dims,starts,dof));
1349   *J   = A;
1350   PetscFunctionReturn(0);
1351 }
1352 
1353 static PetscErrorCode THIDAVecView_VTK_XML(THI thi,DM da,Vec X,const char filename[])
1354 {
1355   const PetscInt    dof   = 2;
1356   Units             units = thi->units;
1357   MPI_Comm          comm;
1358   PetscViewer       viewer;
1359   PetscMPIInt       rank,size,tag,nn,nmax;
1360   PetscInt          mx,my,mz,r,range[6];
1361   const PetscScalar *x;
1362 
1363   PetscFunctionBeginUser;
1364   PetscCall(PetscObjectGetComm((PetscObject)thi,&comm));
1365   PetscCall(DMDAGetInfo(da,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
1366   PetscCallMPI(MPI_Comm_size(comm,&size));
1367   PetscCallMPI(MPI_Comm_rank(comm,&rank));
1368   PetscCall(PetscViewerASCIIOpen(comm,filename,&viewer));
1369   PetscCall(PetscViewerASCIIPrintf(viewer,"<VTKFile type=\"StructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n"));
1370   PetscCall(PetscViewerASCIIPrintf(viewer,"  <StructuredGrid WholeExtent=\"%d %D %d %D %d %D\">\n",0,mz-1,0,my-1,0,mx-1));
1371 
1372   PetscCall(DMDAGetCorners(da,range,range+1,range+2,range+3,range+4,range+5));
1373   PetscCall(PetscMPIIntCast(range[3]*range[4]*range[5]*dof,&nn));
1374   PetscCallMPI(MPI_Reduce(&nn,&nmax,1,MPI_INT,MPI_MAX,0,comm));
1375   tag  = ((PetscObject) viewer)->tag;
1376   PetscCall(VecGetArrayRead(X,&x));
1377   if (rank == 0) {
1378     PetscScalar *array;
1379     PetscCall(PetscMalloc1(nmax,&array));
1380     for (r=0; r<size; r++) {
1381       PetscInt          i,j,k,xs,xm,ys,ym,zs,zm;
1382       const PetscScalar *ptr;
1383       MPI_Status        status;
1384       if (r) {
1385         PetscCallMPI(MPI_Recv(range,6,MPIU_INT,r,tag,comm,MPI_STATUS_IGNORE));
1386       }
1387       zs = range[0];ys = range[1];xs = range[2];zm = range[3];ym = range[4];xm = range[5];
1388       PetscCheck(xm*ym*zm*dof <= nmax,PETSC_COMM_SELF,PETSC_ERR_PLIB,"should not happen");
1389       if (r) {
1390         PetscCallMPI(MPI_Recv(array,nmax,MPIU_SCALAR,r,tag,comm,&status));
1391         PetscCallMPI(MPI_Get_count(&status,MPIU_SCALAR,&nn));
1392         PetscCheck(nn == xm*ym*zm*dof,PETSC_COMM_SELF,PETSC_ERR_PLIB,"should not happen");
1393         ptr = array;
1394       } else ptr = x;
1395       PetscCall(PetscViewerASCIIPrintf(viewer,"    <Piece Extent=\"%D %D %D %D %D %D\">\n",zs,zs+zm-1,ys,ys+ym-1,xs,xs+xm-1));
1396 
1397       PetscCall(PetscViewerASCIIPrintf(viewer,"      <Points>\n"));
1398       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1399       for (i=xs; i<xs+xm; i++) {
1400         for (j=ys; j<ys+ym; j++) {
1401           for (k=zs; k<zs+zm; k++) {
1402             PrmNode   p;
1403             PetscReal xx = thi->Lx*i/mx,yy = thi->Ly*j/my,zz;
1404             thi->initialize(thi,xx,yy,&p);
1405             zz   = PetscRealPart(p.b) + PetscRealPart(p.h)*k/(mz-1);
1406             PetscCall(PetscViewerASCIIPrintf(viewer,"%f %f %f\n",(double)xx,(double)yy,(double)zz));
1407           }
1408         }
1409       }
1410       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1411       PetscCall(PetscViewerASCIIPrintf(viewer,"      </Points>\n"));
1412 
1413       PetscCall(PetscViewerASCIIPrintf(viewer,"      <PointData>\n"));
1414       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Float32\" Name=\"velocity\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1415       for (i=0; i<nn; i+=dof) {
1416         PetscCall(PetscViewerASCIIPrintf(viewer,"%f %f %f\n",(double)(PetscRealPart(ptr[i])*units->year/units->meter),(double)(PetscRealPart(ptr[i+1])*units->year/units->meter),0.0));
1417       }
1418       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1419 
1420       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Int32\" Name=\"rank\" NumberOfComponents=\"1\" format=\"ascii\">\n"));
1421       for (i=0; i<nn; i+=dof) {
1422         PetscCall(PetscViewerASCIIPrintf(viewer,"%D\n",r));
1423       }
1424       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1425       PetscCall(PetscViewerASCIIPrintf(viewer,"      </PointData>\n"));
1426 
1427       PetscCall(PetscViewerASCIIPrintf(viewer,"    </Piece>\n"));
1428     }
1429     PetscCall(PetscFree(array));
1430   } else {
1431     PetscCallMPI(MPI_Send(range,6,MPIU_INT,0,tag,comm));
1432     PetscCallMPI(MPI_Send((PetscScalar*)x,nn,MPIU_SCALAR,0,tag,comm));
1433   }
1434   PetscCall(VecRestoreArrayRead(X,&x));
1435   PetscCall(PetscViewerASCIIPrintf(viewer,"  </StructuredGrid>\n"));
1436   PetscCall(PetscViewerASCIIPrintf(viewer,"</VTKFile>\n"));
1437   PetscCall(PetscViewerDestroy(&viewer));
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 int main(int argc,char *argv[])
1442 {
1443   MPI_Comm       comm;
1444   THI            thi;
1445   PetscErrorCode ierr;
1446   DM             da;
1447   SNES           snes;
1448 
1449   PetscCall(PetscInitialize(&argc,&argv,0,help));
1450   comm = PETSC_COMM_WORLD;
1451 
1452   PetscCall(THICreate(comm,&thi));
1453   {
1454     PetscInt M = 3,N = 3,P = 2;
1455     ierr = PetscOptionsBegin(comm,NULL,"Grid resolution options","");PetscCall(ierr);
1456     {
1457       PetscCall(PetscOptionsInt("-M","Number of elements in x-direction on coarse level","",M,&M,NULL));
1458       N    = M;
1459       PetscCall(PetscOptionsInt("-N","Number of elements in y-direction on coarse level (if different from M)","",N,&N,NULL));
1460       if (thi->coarse2d) {
1461         PetscCall(PetscOptionsInt("-zlevels","Number of elements in z-direction on fine level","",thi->zlevels,&thi->zlevels,NULL));
1462       } else {
1463         PetscCall(PetscOptionsInt("-P","Number of elements in z-direction on coarse level","",P,&P,NULL));
1464       }
1465     }
1466     ierr = PetscOptionsEnd();PetscCall(ierr);
1467     if (thi->coarse2d) {
1468       PetscCall(DMDACreate2d(comm,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,DMDA_STENCIL_BOX,N,M,PETSC_DETERMINE,PETSC_DETERMINE,sizeof(Node)/sizeof(PetscScalar),1,0,0,&da));
1469       PetscCall(DMSetFromOptions(da));
1470       PetscCall(DMSetUp(da));
1471       da->ops->refinehierarchy     = DMRefineHierarchy_THI;
1472       da->ops->createinterpolation = DMCreateInterpolation_DA_THI;
1473 
1474       PetscCall(PetscObjectCompose((PetscObject)da,"THI",(PetscObject)thi));
1475     } else {
1476       PetscCall(DMDACreate3d(comm,DM_BOUNDARY_NONE,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC, DMDA_STENCIL_BOX,P,N,M,1,PETSC_DETERMINE,PETSC_DETERMINE,sizeof(Node)/sizeof(PetscScalar),1,0,0,0,&da));
1477       PetscCall(DMSetFromOptions(da));
1478       PetscCall(DMSetUp(da));
1479     }
1480     PetscCall(DMDASetFieldName(da,0,"x-velocity"));
1481     PetscCall(DMDASetFieldName(da,1,"y-velocity"));
1482   }
1483   PetscCall(THISetUpDM(thi,da));
1484   if (thi->tridiagonal) da->ops->creatematrix = DMCreateMatrix_THI_Tridiagonal;
1485 
1486   {                             /* Set the fine level matrix type if -da_refine */
1487     PetscInt rlevel,clevel;
1488     PetscCall(DMGetRefineLevel(da,&rlevel));
1489     PetscCall(DMGetCoarsenLevel(da,&clevel));
1490     if (rlevel - clevel > 0) PetscCall(DMSetMatType(da,thi->mattype));
1491   }
1492 
1493   PetscCall(DMDASNESSetFunctionLocal(da,ADD_VALUES,(DMDASNESFunction)THIFunctionLocal,thi));
1494   if (thi->tridiagonal) {
1495     PetscCall(DMDASNESSetJacobianLocal(da,(DMDASNESJacobian)THIJacobianLocal_3D_Tridiagonal,thi));
1496   } else {
1497     PetscCall(DMDASNESSetJacobianLocal(da,(DMDASNESJacobian)THIJacobianLocal_3D_Full,thi));
1498   }
1499   PetscCall(DMCoarsenHookAdd(da,DMCoarsenHook_THI,NULL,thi));
1500   PetscCall(DMRefineHookAdd(da,DMRefineHook_THI,NULL,thi));
1501 
1502   PetscCall(DMSetApplicationContext(da,thi));
1503 
1504   PetscCall(SNESCreate(comm,&snes));
1505   PetscCall(SNESSetDM(snes,da));
1506   PetscCall(DMDestroy(&da));
1507   PetscCall(SNESSetComputeInitialGuess(snes,THIInitial,NULL));
1508   PetscCall(SNESSetFromOptions(snes));
1509 
1510   PetscCall(SNESSolve(snes,NULL,NULL));
1511 
1512   PetscCall(THISolveStatistics(thi,snes,0,"Full"));
1513 
1514   {
1515     PetscBool flg;
1516     char      filename[PETSC_MAX_PATH_LEN] = "";
1517     PetscCall(PetscOptionsGetString(NULL,NULL,"-o",filename,sizeof(filename),&flg));
1518     if (flg) {
1519       Vec X;
1520       DM  dm;
1521       PetscCall(SNESGetSolution(snes,&X));
1522       PetscCall(SNESGetDM(snes,&dm));
1523       PetscCall(THIDAVecView_VTK_XML(thi,dm,X,filename));
1524     }
1525   }
1526 
1527   PetscCall(DMDestroy(&da));
1528   PetscCall(SNESDestroy(&snes));
1529   PetscCall(THIDestroy(&thi));
1530   PetscCall(PetscFinalize());
1531   return 0;
1532 }
1533 
1534 /*TEST
1535 
1536    build:
1537       requires: !single
1538 
1539    test:
1540       args: -M 6 -P 4 -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type icc
1541 
1542    test:
1543       suffix: 2
1544       nsize: 2
1545       args: -M 6 -P 4 -thi_hom z -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 6 -mg_levels_0_pc_type redundant -snes_grid_sequence 1 -mat_partitioning_type current -ksp_atol -1
1546 
1547    test:
1548       suffix: 3
1549       nsize: 3
1550       args: -M 7 -P 4 -thi_hom z -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type baij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_pc_asm_type restrict -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 9 -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mat_partitioning_type current
1551 
1552    test:
1553       suffix: 4
1554       nsize: 6
1555       args: -M 4 -P 2 -da_refine_hierarchy_x 1,1,3 -da_refine_hierarchy_y 2,2,1 -da_refine_hierarchy_z 2,2,1 -snes_grid_sequence 3 -ksp_converged_reason -ksp_type fgmres -ksp_rtol 1e-2 -pc_type mg -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type bjacobi -mg_levels_1_sub_pc_type cholesky -pc_mg_type multiplicative -snes_converged_reason -snes_stol 1e-12 -thi_L 80e3 -thi_alpha 0.05 -thi_friction_m 1 -thi_hom x -snes_view -mg_levels_0_pc_type redundant -mg_levels_0_ksp_type preonly -ksp_atol -1
1556 
1557    test:
1558       suffix: 5
1559       nsize: 6
1560       args: -M 12 -P 5 -snes_monitor_short -ksp_converged_reason -pc_type asm -pc_asm_type restrict -dm_mat_type {{aij baij sbaij}}
1561 
1562 TEST*/
1563