xref: /petsc/src/snes/tutorials/ex48.c (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
1 static const char help[] = "Toy hydrostatic ice flow with multigrid in 3D.\n\
2 \n\
3 Solves the hydrostatic (aka Blatter/Pattyn/First Order) equations for ice sheet flow\n\
4 using multigrid.  The ice uses a power-law rheology with \"Glen\" exponent 3 (corresponds\n\
5 to p=4/3 in a p-Laplacian).  The focus is on ISMIP-HOM experiments which assume periodic\n\
6 boundary conditions in the x- and y-directions.\n\
7 \n\
8 Equations are rescaled so that the domain size and solution are O(1), details of this scaling\n\
9 can be controlled by the options -units_meter, -units_second, and -units_kilogram.\n\
10 \n\
11 A VTK StructuredGrid output file can be written using the option -o filename.vts\n\
12 \n\n";
13 
14 /*
15 The equations for horizontal velocity (u,v) are
16 
17   - [eta (4 u_x + 2 v_y)]_x - [eta (u_y + v_x)]_y - [eta u_z]_z + rho g s_x = 0
18   - [eta (4 v_y + 2 u_x)]_y - [eta (u_y + v_x)]_x - [eta v_z]_z + rho g s_y = 0
19 
20 where
21 
22   eta = B/2 (epsilon + gamma)^((p-2)/2)
23 
24 is the nonlinear effective viscosity with regularization epsilon and hardness parameter B,
25 written in terms of the second invariant
26 
27   gamma = u_x^2 + v_y^2 + u_x v_y + (1/4) (u_y + v_x)^2 + (1/4) u_z^2 + (1/4) v_z^2
28 
29 The surface boundary conditions are the natural conditions.  The basal boundary conditions
30 are either no-slip, or Navier (linear) slip with spatially variant friction coefficient beta^2.
31 
32 In the code, the equations for (u,v) are multiplied through by 1/(rho g) so that residuals are O(1).
33 
34 The discretization is Q1 finite elements, managed by a DMDA.  The grid is never distorted in the
35 map (x,y) plane, but the bed and surface may be bumpy.  This is handled as usual in FEM, through
36 the Jacobian of the coordinate transformation from a reference element to the physical element.
37 
38 Since ice-flow is tightly coupled in the z-direction (within columns), the DMDA is managed
39 specially so that columns are never distributed, and are always contiguous in memory.
40 This amounts to reversing the meaning of X,Y,Z compared to the DMDA's internal interpretation,
41 and then indexing as vec[i][j][k].  The exotic coarse spaces require 2D DMDAs which are made to
42 use compatible domain decomposition relative to the 3D DMDAs.
43 
44 There are two compile-time options:
45 
46   NO_SSE2:
47     If the host supports SSE2, we use integration code that has been vectorized with SSE2
48     intrinsics, unless this macro is defined.  The intrinsics speed up integration by about
49     30% on my architecture (P8700, gcc-4.5 snapshot).
50 
51   COMPUTE_LOWER_TRIANGULAR:
52     The element matrices we assemble are lower-triangular so it is not necessary to compute
53     all entries explicitly.  If this macro is defined, the lower-triangular entries are
54     computed explicitly.
55 
56 */
57 
58 #if defined(PETSC_APPLE_FRAMEWORK)
59 #import <PETSc/petscsnes.h>
60 #import <PETSc/petsc/private/dmdaimpl.h> /* There is not yet a public interface to manipulate dm->ops */
61 #else
62 
63 #include <petscsnes.h>
64 #include <petsc/private/dmdaimpl.h> /* There is not yet a public interface to manipulate dm->ops */
65 #endif
66 #include <ctype.h> /* toupper() */
67 
68 #if defined(__cplusplus) || defined(PETSC_HAVE_WINDOWS_COMPILERS) || defined(__PGI)
69 /*  c++ cannot handle  [_restrict_] notation like C does */
70 #undef PETSC_RESTRICT
71 #define PETSC_RESTRICT
72 #endif
73 
74 #if defined __SSE2__
75 #include <emmintrin.h>
76 #endif
77 
78 /* The SSE2 kernels are only for PetscScalar=double on architectures that support it */
79 #if !defined NO_SSE2 && !defined PETSC_USE_COMPLEX && !defined PETSC_USE_REAL_SINGLE && !defined PETSC_USE_REAL___FLOAT128 && !defined PETSC_USE_REAL___FP16 && defined __SSE2__
80 #define USE_SSE2_KERNELS 1
81 #else
82 #define USE_SSE2_KERNELS 0
83 #endif
84 
85 static PetscClassId THI_CLASSID;
86 
87 typedef enum {
88   QUAD_GAUSS,
89   QUAD_LOBATTO
90 } QuadratureType;
91 static const char                  *QuadratureTypes[] = {"gauss", "lobatto", "QuadratureType", "QUAD_", 0};
92 PETSC_UNUSED static const PetscReal HexQWeights[8]    = {1, 1, 1, 1, 1, 1, 1, 1};
93 PETSC_UNUSED static const PetscReal HexQNodes[]       = {-0.57735026918962573, 0.57735026918962573};
94 #define G 0.57735026918962573
95 #define H (0.5 * (1. + G))
96 #define L (0.5 * (1. - G))
97 #define M (-0.5)
98 #define P (0.5)
99 /* Special quadrature: Lobatto in horizontal, Gauss in vertical */
100 static const PetscReal HexQInterp_Lobatto[8][8] = {
101   {H, 0, 0, 0, L, 0, 0, 0},
102   {0, H, 0, 0, 0, L, 0, 0},
103   {0, 0, H, 0, 0, 0, L, 0},
104   {0, 0, 0, H, 0, 0, 0, L},
105   {L, 0, 0, 0, H, 0, 0, 0},
106   {0, L, 0, 0, 0, H, 0, 0},
107   {0, 0, L, 0, 0, 0, H, 0},
108   {0, 0, 0, L, 0, 0, 0, H}
109 };
110 static const PetscReal HexQDeriv_Lobatto[8][8][3] = {
111   {{M * H, M *H, M}, {P * H, 0, 0},    {0, 0, 0},        {0, P *H, 0},     {M * L, M *L, P}, {P * L, 0, 0},    {0, 0, 0},        {0, P *L, 0}    },
112   {{M * H, 0, 0},    {P * H, M *H, M}, {0, P *H, 0},     {0, 0, 0},        {M * L, 0, 0},    {P * L, M *L, P}, {0, P *L, 0},     {0, 0, 0}       },
113   {{0, 0, 0},        {0, M *H, 0},     {P * H, P *H, M}, {M * H, 0, 0},    {0, 0, 0},        {0, M *L, 0},     {P * L, P *L, P}, {M * L, 0, 0}   },
114   {{0, M *H, 0},     {0, 0, 0},        {P * H, 0, 0},    {M * H, P *H, M}, {0, M *L, 0},     {0, 0, 0},        {P * L, 0, 0},    {M * L, P *L, P}},
115   {{M * L, M *L, M}, {P * L, 0, 0},    {0, 0, 0},        {0, P *L, 0},     {M * H, M *H, P}, {P * H, 0, 0},    {0, 0, 0},        {0, P *H, 0}    },
116   {{M * L, 0, 0},    {P * L, M *L, M}, {0, P *L, 0},     {0, 0, 0},        {M * H, 0, 0},    {P * H, M *H, P}, {0, P *H, 0},     {0, 0, 0}       },
117   {{0, 0, 0},        {0, M *L, 0},     {P * L, P *L, M}, {M * L, 0, 0},    {0, 0, 0},        {0, M *H, 0},     {P * H, P *H, P}, {M * H, 0, 0}   },
118   {{0, M *L, 0},     {0, 0, 0},        {P * L, 0, 0},    {M * L, P *L, M}, {0, M *H, 0},     {0, 0, 0},        {P * H, 0, 0},    {M * H, P *H, P}}
119 };
120 /* Stanndard Gauss */
121 static const PetscReal HexQInterp_Gauss[8][8] = {
122   {H * H * H, L *H *H, L *L *H, H *L *H, H *H *L, L *H *L, L *L *L, H *L *L},
123   {L * H * H, H *H *H, H *L *H, L *L *H, L *H *L, H *H *L, H *L *L, L *L *L},
124   {L * L * H, H *L *H, H *H *H, L *H *H, L *L *L, H *L *L, H *H *L, L *H *L},
125   {H * L * H, L *L *H, L *H *H, H *H *H, H *L *L, L *L *L, L *H *L, H *H *L},
126   {H * H * L, L *H *L, L *L *L, H *L *L, H *H *H, L *H *H, L *L *H, H *L *H},
127   {L * H * L, H *H *L, H *L *L, L *L *L, L *H *H, H *H *H, H *L *H, L *L *H},
128   {L * L * L, H *L *L, H *H *L, L *H *L, L *L *H, H *L *H, H *H *H, L *H *H},
129   {H * L * L, L *L *L, L *H *L, H *H *L, H *L *H, L *L *H, L *H *H, H *H *H}
130 };
131 static const PetscReal HexQDeriv_Gauss[8][8][3] = {
132   {{M * H * H, H *M *H, H *H *M}, {P * H * H, L *M *H, L *H *M}, {P * L * H, L *P *H, L *L *M}, {M * L * H, H *P *H, H *L *M}, {M * H * L, H *M *L, H *H *P}, {P * H * L, L *M *L, L *H *P}, {P * L * L, L *P *L, L *L *P}, {M * L * L, H *P *L, H *L *P}},
133   {{M * H * H, L *M *H, L *H *M}, {P * H * H, H *M *H, H *H *M}, {P * L * H, H *P *H, H *L *M}, {M * L * H, L *P *H, L *L *M}, {M * H * L, L *M *L, L *H *P}, {P * H * L, H *M *L, H *H *P}, {P * L * L, H *P *L, H *L *P}, {M * L * L, L *P *L, L *L *P}},
134   {{M * L * H, L *M *H, L *L *M}, {P * L * H, H *M *H, H *L *M}, {P * H * H, H *P *H, H *H *M}, {M * H * H, L *P *H, L *H *M}, {M * L * L, L *M *L, L *L *P}, {P * L * L, H *M *L, H *L *P}, {P * H * L, H *P *L, H *H *P}, {M * H * L, L *P *L, L *H *P}},
135   {{M * L * H, H *M *H, H *L *M}, {P * L * H, L *M *H, L *L *M}, {P * H * H, L *P *H, L *H *M}, {M * H * H, H *P *H, H *H *M}, {M * L * L, H *M *L, H *L *P}, {P * L * L, L *M *L, L *L *P}, {P * H * L, L *P *L, L *H *P}, {M * H * L, H *P *L, H *H *P}},
136   {{M * H * L, H *M *L, H *H *M}, {P * H * L, L *M *L, L *H *M}, {P * L * L, L *P *L, L *L *M}, {M * L * L, H *P *L, H *L *M}, {M * H * H, H *M *H, H *H *P}, {P * H * H, L *M *H, L *H *P}, {P * L * H, L *P *H, L *L *P}, {M * L * H, H *P *H, H *L *P}},
137   {{M * H * L, L *M *L, L *H *M}, {P * H * L, H *M *L, H *H *M}, {P * L * L, H *P *L, H *L *M}, {M * L * L, L *P *L, L *L *M}, {M * H * H, L *M *H, L *H *P}, {P * H * H, H *M *H, H *H *P}, {P * L * H, H *P *H, H *L *P}, {M * L * H, L *P *H, L *L *P}},
138   {{M * L * L, L *M *L, L *L *M}, {P * L * L, H *M *L, H *L *M}, {P * H * L, H *P *L, H *H *M}, {M * H * L, L *P *L, L *H *M}, {M * L * H, L *M *H, L *L *P}, {P * L * H, H *M *H, H *L *P}, {P * H * H, H *P *H, H *H *P}, {M * H * H, L *P *H, L *H *P}},
139   {{M * L * L, H *M *L, H *L *M}, {P * L * L, L *M *L, L *L *M}, {P * H * L, L *P *L, L *H *M}, {M * H * L, H *P *L, H *H *M}, {M * L * H, H *M *H, H *L *P}, {P * L * H, L *M *H, L *L *P}, {P * H * H, L *P *H, L *H *P}, {M * H * H, H *P *H, H *H *P}}
140 };
141 static const PetscReal (*HexQInterp)[8], (*HexQDeriv)[8][3];
142 /* Standard 2x2 Gauss quadrature for the bottom layer. */
143 static const PetscReal QuadQInterp[4][4] = {
144   {H * H, L *H, L *L, H *L},
145   {L * H, H *H, H *L, L *L},
146   {L * L, H *L, H *H, L *H},
147   {H * L, L *L, L *H, H *H}
148 };
149 static const PetscReal QuadQDeriv[4][4][2] = {
150   {{M * H, M *H}, {P * H, M *L}, {P * L, P *L}, {M * L, P *H}},
151   {{M * H, M *L}, {P * H, M *H}, {P * L, P *H}, {M * L, P *L}},
152   {{M * L, M *L}, {P * L, M *H}, {P * H, P *H}, {M * H, P *L}},
153   {{M * L, M *H}, {P * L, M *L}, {P * H, P *L}, {M * H, P *H}}
154 };
155 #undef G
156 #undef H
157 #undef L
158 #undef M
159 #undef P
160 
161 #define HexExtract(x, i, j, k, n) \
162   do { \
163     (n)[0] = (x)[i][j][k]; \
164     (n)[1] = (x)[i + 1][j][k]; \
165     (n)[2] = (x)[i + 1][j + 1][k]; \
166     (n)[3] = (x)[i][j + 1][k]; \
167     (n)[4] = (x)[i][j][k + 1]; \
168     (n)[5] = (x)[i + 1][j][k + 1]; \
169     (n)[6] = (x)[i + 1][j + 1][k + 1]; \
170     (n)[7] = (x)[i][j + 1][k + 1]; \
171   } while (0)
172 
173 #define HexExtractRef(x, i, j, k, n) \
174   do { \
175     (n)[0] = &(x)[i][j][k]; \
176     (n)[1] = &(x)[i + 1][j][k]; \
177     (n)[2] = &(x)[i + 1][j + 1][k]; \
178     (n)[3] = &(x)[i][j + 1][k]; \
179     (n)[4] = &(x)[i][j][k + 1]; \
180     (n)[5] = &(x)[i + 1][j][k + 1]; \
181     (n)[6] = &(x)[i + 1][j + 1][k + 1]; \
182     (n)[7] = &(x)[i][j + 1][k + 1]; \
183   } while (0)
184 
185 #define QuadExtract(x, i, j, n) \
186   do { \
187     (n)[0] = (x)[i][j]; \
188     (n)[1] = (x)[i + 1][j]; \
189     (n)[2] = (x)[i + 1][j + 1]; \
190     (n)[3] = (x)[i][j + 1]; \
191   } while (0)
192 
193 static void HexGrad(const PetscReal dphi[][3], const PetscReal zn[], PetscReal dz[]) {
194   PetscInt i;
195   dz[0] = dz[1] = dz[2] = 0;
196   for (i = 0; i < 8; i++) {
197     dz[0] += dphi[i][0] * zn[i];
198     dz[1] += dphi[i][1] * zn[i];
199     dz[2] += dphi[i][2] * zn[i];
200   }
201 }
202 
203 static void HexComputeGeometry(PetscInt q, PetscReal hx, PetscReal hy, const PetscReal dz[PETSC_RESTRICT], PetscReal phi[PETSC_RESTRICT], PetscReal dphi[PETSC_RESTRICT][3], PetscReal *PETSC_RESTRICT jw) {
204   const PetscReal jac[3][3] = {
205     {hx / 2, 0,      0    },
206     {0,      hy / 2, 0    },
207     {dz[0],  dz[1],  dz[2]}
208   };
209   const PetscReal ijac[3][3] = {
210     {1 / jac[0][0],                        0,                                    0            },
211     {0,                                    1 / jac[1][1],                        0            },
212     {-jac[2][0] / (jac[0][0] * jac[2][2]), -jac[2][1] / (jac[1][1] * jac[2][2]), 1 / jac[2][2]}
213   };
214   const PetscReal jdet = jac[0][0] * jac[1][1] * jac[2][2];
215   PetscInt        i;
216 
217   for (i = 0; i < 8; i++) {
218     const PetscReal *dphir = HexQDeriv[q][i];
219     phi[i]                 = HexQInterp[q][i];
220     dphi[i][0]             = dphir[0] * ijac[0][0] + dphir[1] * ijac[1][0] + dphir[2] * ijac[2][0];
221     dphi[i][1]             = dphir[0] * ijac[0][1] + dphir[1] * ijac[1][1] + dphir[2] * ijac[2][1];
222     dphi[i][2]             = dphir[0] * ijac[0][2] + dphir[1] * ijac[1][2] + dphir[2] * ijac[2][2];
223   }
224   *jw = 1.0 * jdet;
225 }
226 
227 typedef struct _p_THI   *THI;
228 typedef struct _n_Units *Units;
229 
230 typedef struct {
231   PetscScalar u, v;
232 } Node;
233 
234 typedef struct {
235   PetscScalar b;     /* bed */
236   PetscScalar h;     /* thickness */
237   PetscScalar beta2; /* friction */
238 } PrmNode;
239 
240 typedef struct {
241   PetscReal min, max, cmin, cmax;
242 } PRange;
243 
244 typedef enum {
245   THIASSEMBLY_TRIDIAGONAL,
246   THIASSEMBLY_FULL
247 } THIAssemblyMode;
248 
249 struct _p_THI {
250   PETSCHEADER(int);
251   void (*initialize)(THI, PetscReal x, PetscReal y, PrmNode *p);
252   PetscInt  zlevels;
253   PetscReal Lx, Ly, Lz; /* Model domain */
254   PetscReal alpha;      /* Bed angle */
255   Units     units;
256   PetscReal dirichlet_scale;
257   PetscReal ssa_friction_scale;
258   PRange    eta;
259   PRange    beta2;
260   struct {
261     PetscReal Bd2, eps, exponent;
262   } viscosity;
263   struct {
264     PetscReal irefgam, eps2, exponent, refvel, epsvel;
265   } friction;
266   PetscReal rhog;
267   PetscBool no_slip;
268   PetscBool tridiagonal;
269   PetscBool coarse2d;
270   PetscBool verbose;
271   MatType   mattype;
272 };
273 
274 struct _n_Units {
275   /* fundamental */
276   PetscReal meter;
277   PetscReal kilogram;
278   PetscReal second;
279   /* derived */
280   PetscReal Pascal;
281   PetscReal year;
282 };
283 
284 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo *, Node ***, Mat, Mat, THI);
285 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo *, Node ***, Mat, Mat, THI);
286 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo *, Node **, Mat, Mat, THI);
287 
288 static void PrmHexGetZ(const PrmNode pn[], PetscInt k, PetscInt zm, PetscReal zn[]) {
289   const PetscScalar zm1 = zm - 1, znl[8] = {pn[0].b + pn[0].h * (PetscScalar)k / zm1,       pn[1].b + pn[1].h * (PetscScalar)k / zm1,       pn[2].b + pn[2].h * (PetscScalar)k / zm1,       pn[3].b + pn[3].h * (PetscScalar)k / zm1,
290                                             pn[0].b + pn[0].h * (PetscScalar)(k + 1) / zm1, pn[1].b + pn[1].h * (PetscScalar)(k + 1) / zm1, pn[2].b + pn[2].h * (PetscScalar)(k + 1) / zm1, pn[3].b + pn[3].h * (PetscScalar)(k + 1) / zm1};
291   PetscInt          i;
292   for (i = 0; i < 8; i++) zn[i] = PetscRealPart(znl[i]);
293 }
294 
295 /* Tests A and C are from the ISMIP-HOM paper (Pattyn et al. 2008) */
296 static void THIInitialize_HOM_A(THI thi, PetscReal x, PetscReal y, PrmNode *p) {
297   Units     units = thi->units;
298   PetscReal s     = -x * PetscSinReal(thi->alpha);
299 
300   p->b     = s - 1000 * units->meter + 500 * units->meter * PetscSinReal(x * 2 * PETSC_PI / thi->Lx) * PetscSinReal(y * 2 * PETSC_PI / thi->Ly);
301   p->h     = s - p->b;
302   p->beta2 = 1e30;
303 }
304 
305 static void THIInitialize_HOM_C(THI thi, PetscReal x, PetscReal y, PrmNode *p) {
306   Units     units = thi->units;
307   PetscReal s     = -x * PetscSinReal(thi->alpha);
308 
309   p->b     = s - 1000 * units->meter;
310   p->h     = s - p->b;
311   /* tau_b = beta2 v   is a stress (Pa) */
312   p->beta2 = 1000 * (1 + PetscSinReal(x * 2 * PETSC_PI / thi->Lx) * PetscSinReal(y * 2 * PETSC_PI / thi->Ly)) * units->Pascal * units->year / units->meter;
313 }
314 
315 /* These are just toys */
316 
317 /* Same bed as test A, free slip everywhere except for a discontinuous jump to a circular sticky region in the middle. */
318 static void THIInitialize_HOM_X(THI thi, PetscReal xx, PetscReal yy, PrmNode *p) {
319   Units     units = thi->units;
320   PetscReal x = xx * 2 * PETSC_PI / thi->Lx - PETSC_PI, y = yy * 2 * PETSC_PI / thi->Ly - PETSC_PI; /* [-pi,pi] */
321   PetscReal r = PetscSqrtReal(x * x + y * y), s = -x * PetscSinReal(thi->alpha);
322   p->b     = s - 1000 * units->meter + 500 * units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
323   p->h     = s - p->b;
324   p->beta2 = 1000 * (r < 1 ? 2 : 0) * units->Pascal * units->year / units->meter;
325 }
326 
327 /* Like Z, but with 200 meter cliffs */
328 static void THIInitialize_HOM_Y(THI thi, PetscReal xx, PetscReal yy, PrmNode *p) {
329   Units     units = thi->units;
330   PetscReal x = xx * 2 * PETSC_PI / thi->Lx - PETSC_PI, y = yy * 2 * PETSC_PI / thi->Ly - PETSC_PI; /* [-pi,pi] */
331   PetscReal r = PetscSqrtReal(x * x + y * y), s = -x * PetscSinReal(thi->alpha);
332 
333   p->b = s - 1000 * units->meter + 500 * units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
334   if (PetscRealPart(p->b) > -700 * units->meter) p->b += 200 * units->meter;
335   p->h     = s - p->b;
336   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16 * r)) / PetscSqrtReal(1e-2 + 16 * r) * PetscCosReal(x * 3 / 2) * PetscCosReal(y * 3 / 2)) * units->Pascal * units->year / units->meter;
337 }
338 
339 /* Same bed as A, smoothly varying slipperiness, similar to MATLAB's "sombrero" (uncorrelated with bathymetry) */
340 static void THIInitialize_HOM_Z(THI thi, PetscReal xx, PetscReal yy, PrmNode *p) {
341   Units     units = thi->units;
342   PetscReal x = xx * 2 * PETSC_PI / thi->Lx - PETSC_PI, y = yy * 2 * PETSC_PI / thi->Ly - PETSC_PI; /* [-pi,pi] */
343   PetscReal r = PetscSqrtReal(x * x + y * y), s = -x * PetscSinReal(thi->alpha);
344 
345   p->b     = s - 1000 * units->meter + 500 * units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
346   p->h     = s - p->b;
347   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16 * r)) / PetscSqrtReal(1e-2 + 16 * r) * PetscCosReal(x * 3 / 2) * PetscCosReal(y * 3 / 2)) * units->Pascal * units->year / units->meter;
348 }
349 
350 static void THIFriction(THI thi, PetscReal rbeta2, PetscReal gam, PetscReal *beta2, PetscReal *dbeta2) {
351   if (thi->friction.irefgam == 0) {
352     Units units           = thi->units;
353     thi->friction.irefgam = 1. / (0.5 * PetscSqr(thi->friction.refvel * units->meter / units->year));
354     thi->friction.eps2    = 0.5 * PetscSqr(thi->friction.epsvel * units->meter / units->year) * thi->friction.irefgam;
355   }
356   if (thi->friction.exponent == 0) {
357     *beta2  = rbeta2;
358     *dbeta2 = 0;
359   } else {
360     *beta2  = rbeta2 * PetscPowReal(thi->friction.eps2 + gam * thi->friction.irefgam, thi->friction.exponent);
361     *dbeta2 = thi->friction.exponent * *beta2 / (thi->friction.eps2 + gam * thi->friction.irefgam) * thi->friction.irefgam;
362   }
363 }
364 
365 static void THIViscosity(THI thi, PetscReal gam, PetscReal *eta, PetscReal *deta) {
366   PetscReal Bd2, eps, exponent;
367   if (thi->viscosity.Bd2 == 0) {
368     Units           units   = thi->units;
369     const PetscReal n       = 3.,                                                     /* Glen exponent */
370       p                     = 1. + 1. / n,                                            /* for Stokes */
371       A                     = 1.e-16 * PetscPowReal(units->Pascal, -n) / units->year, /* softness parameter (Pa^{-n}/s) */
372       B                     = PetscPowReal(A, -1. / n);                               /* hardness parameter */
373     thi->viscosity.Bd2      = B / 2;
374     thi->viscosity.exponent = (p - 2) / 2;
375     thi->viscosity.eps      = 0.5 * PetscSqr(1e-5 / units->year);
376   }
377   Bd2      = thi->viscosity.Bd2;
378   exponent = thi->viscosity.exponent;
379   eps      = thi->viscosity.eps;
380   *eta     = Bd2 * PetscPowReal(eps + gam, exponent);
381   *deta    = exponent * (*eta) / (eps + gam);
382 }
383 
384 static void RangeUpdate(PetscReal *min, PetscReal *max, PetscReal x) {
385   if (x < *min) *min = x;
386   if (x > *max) *max = x;
387 }
388 
389 static void PRangeClear(PRange *p) {
390   p->cmin = p->min = 1e100;
391   p->cmax = p->max = -1e100;
392 }
393 
394 static PetscErrorCode PRangeMinMax(PRange *p, PetscReal min, PetscReal max) {
395   PetscFunctionBeginUser;
396   p->cmin = min;
397   p->cmax = max;
398   if (min < p->min) p->min = min;
399   if (max > p->max) p->max = max;
400   PetscFunctionReturn(0);
401 }
402 
403 static PetscErrorCode THIDestroy(THI *thi) {
404   PetscFunctionBeginUser;
405   if (!*thi) PetscFunctionReturn(0);
406   if (--((PetscObject)(*thi))->refct > 0) {
407     *thi = 0;
408     PetscFunctionReturn(0);
409   }
410   PetscCall(PetscFree((*thi)->units));
411   PetscCall(PetscFree((*thi)->mattype));
412   PetscCall(PetscHeaderDestroy(thi));
413   PetscFunctionReturn(0);
414 }
415 
416 static PetscErrorCode THICreate(MPI_Comm comm, THI *inthi) {
417   static PetscBool registered = PETSC_FALSE;
418   THI              thi;
419   Units            units;
420 
421   PetscFunctionBeginUser;
422   *inthi = 0;
423   if (!registered) {
424     PetscCall(PetscClassIdRegister("Toy Hydrostatic Ice", &THI_CLASSID));
425     registered = PETSC_TRUE;
426   }
427   PetscCall(PetscHeaderCreate(thi, THI_CLASSID, "THI", "Toy Hydrostatic Ice", "", comm, THIDestroy, 0));
428 
429   PetscCall(PetscNew(&thi->units));
430   units           = thi->units;
431   units->meter    = 1e-2;
432   units->second   = 1e-7;
433   units->kilogram = 1e-12;
434 
435   PetscOptionsBegin(comm, NULL, "Scaled units options", "");
436   {
437     PetscCall(PetscOptionsReal("-units_meter", "1 meter in scaled length units", "", units->meter, &units->meter, NULL));
438     PetscCall(PetscOptionsReal("-units_second", "1 second in scaled time units", "", units->second, &units->second, NULL));
439     PetscCall(PetscOptionsReal("-units_kilogram", "1 kilogram in scaled mass units", "", units->kilogram, &units->kilogram, NULL));
440   }
441   PetscOptionsEnd();
442   units->Pascal = units->kilogram / (units->meter * PetscSqr(units->second));
443   units->year   = 31556926. * units->second; /* seconds per year */
444 
445   thi->Lx              = 10.e3;
446   thi->Ly              = 10.e3;
447   thi->Lz              = 1000;
448   thi->dirichlet_scale = 1;
449   thi->verbose         = PETSC_FALSE;
450 
451   PetscOptionsBegin(comm, NULL, "Toy Hydrostatic Ice options", "");
452   {
453     QuadratureType quad       = QUAD_GAUSS;
454     char           homexp[]   = "A";
455     char           mtype[256] = MATSBAIJ;
456     PetscReal      L, m = 1.0;
457     PetscBool      flg;
458     L = thi->Lx;
459     PetscCall(PetscOptionsReal("-thi_L", "Domain size (m)", "", L, &L, &flg));
460     if (flg) thi->Lx = thi->Ly = L;
461     PetscCall(PetscOptionsReal("-thi_Lx", "X Domain size (m)", "", thi->Lx, &thi->Lx, NULL));
462     PetscCall(PetscOptionsReal("-thi_Ly", "Y Domain size (m)", "", thi->Ly, &thi->Ly, NULL));
463     PetscCall(PetscOptionsReal("-thi_Lz", "Z Domain size (m)", "", thi->Lz, &thi->Lz, NULL));
464     PetscCall(PetscOptionsString("-thi_hom", "ISMIP-HOM experiment (A or C)", "", homexp, homexp, sizeof(homexp), NULL));
465     switch (homexp[0] = toupper(homexp[0])) {
466     case 'A':
467       thi->initialize = THIInitialize_HOM_A;
468       thi->no_slip    = PETSC_TRUE;
469       thi->alpha      = 0.5;
470       break;
471     case 'C':
472       thi->initialize = THIInitialize_HOM_C;
473       thi->no_slip    = PETSC_FALSE;
474       thi->alpha      = 0.1;
475       break;
476     case 'X':
477       thi->initialize = THIInitialize_HOM_X;
478       thi->no_slip    = PETSC_FALSE;
479       thi->alpha      = 0.3;
480       break;
481     case 'Y':
482       thi->initialize = THIInitialize_HOM_Y;
483       thi->no_slip    = PETSC_FALSE;
484       thi->alpha      = 0.5;
485       break;
486     case 'Z':
487       thi->initialize = THIInitialize_HOM_Z;
488       thi->no_slip    = PETSC_FALSE;
489       thi->alpha      = 0.5;
490       break;
491     default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "HOM experiment '%c' not implemented", homexp[0]);
492     }
493     PetscCall(PetscOptionsEnum("-thi_quadrature", "Quadrature to use for 3D elements", "", QuadratureTypes, (PetscEnum)quad, (PetscEnum *)&quad, NULL));
494     switch (quad) {
495     case QUAD_GAUSS:
496       HexQInterp = HexQInterp_Gauss;
497       HexQDeriv  = HexQDeriv_Gauss;
498       break;
499     case QUAD_LOBATTO:
500       HexQInterp = HexQInterp_Lobatto;
501       HexQDeriv  = HexQDeriv_Lobatto;
502       break;
503     }
504     PetscCall(PetscOptionsReal("-thi_alpha", "Bed angle (degrees)", "", thi->alpha, &thi->alpha, NULL));
505 
506     thi->friction.refvel = 100.;
507     thi->friction.epsvel = 1.;
508 
509     PetscCall(PetscOptionsReal("-thi_friction_refvel", "Reference velocity for sliding", "", thi->friction.refvel, &thi->friction.refvel, NULL));
510     PetscCall(PetscOptionsReal("-thi_friction_epsvel", "Regularization velocity for sliding", "", thi->friction.epsvel, &thi->friction.epsvel, NULL));
511     PetscCall(PetscOptionsReal("-thi_friction_m", "Friction exponent, 0=Coulomb, 1=Navier", "", m, &m, NULL));
512 
513     thi->friction.exponent = (m - 1) / 2;
514 
515     PetscCall(PetscOptionsReal("-thi_dirichlet_scale", "Scale Dirichlet boundary conditions by this factor", "", thi->dirichlet_scale, &thi->dirichlet_scale, NULL));
516     PetscCall(PetscOptionsReal("-thi_ssa_friction_scale", "Scale slip boundary conditions by this factor in SSA (2D) assembly", "", thi->ssa_friction_scale, &thi->ssa_friction_scale, NULL));
517     PetscCall(PetscOptionsBool("-thi_coarse2d", "Use a 2D coarse space corresponding to SSA", "", thi->coarse2d, &thi->coarse2d, NULL));
518     PetscCall(PetscOptionsBool("-thi_tridiagonal", "Assemble a tridiagonal system (column coupling only) on the finest level", "", thi->tridiagonal, &thi->tridiagonal, NULL));
519     PetscCall(PetscOptionsFList("-thi_mat_type", "Matrix type", "MatSetType", MatList, mtype, (char *)mtype, sizeof(mtype), NULL));
520     PetscCall(PetscStrallocpy(mtype, (char **)&thi->mattype));
521     PetscCall(PetscOptionsBool("-thi_verbose", "Enable verbose output (like matrix sizes and statistics)", "", thi->verbose, &thi->verbose, NULL));
522   }
523   PetscOptionsEnd();
524 
525   /* dimensionalize */
526   thi->Lx *= units->meter;
527   thi->Ly *= units->meter;
528   thi->Lz *= units->meter;
529   thi->alpha *= PETSC_PI / 180;
530 
531   PRangeClear(&thi->eta);
532   PRangeClear(&thi->beta2);
533 
534   {
535     PetscReal u = 1000 * units->meter / (3e7 * units->second), gradu = u / (100 * units->meter), eta, deta, rho = 910 * units->kilogram / PetscPowReal(units->meter, 3), grav = 9.81 * units->meter / PetscSqr(units->second),
536               driving = rho * grav * PetscSinReal(thi->alpha) * 1000 * units->meter;
537     THIViscosity(thi, 0.5 * gradu * gradu, &eta, &deta);
538     thi->rhog = rho * grav;
539     if (thi->verbose) {
540       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Units: meter %8.2g  second %8.2g  kg %8.2g  Pa %8.2g\n", (double)units->meter, (double)units->second, (double)units->kilogram, (double)units->Pascal));
541       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Domain (%6.2g,%6.2g,%6.2g), pressure %8.2g, driving stress %8.2g\n", (double)thi->Lx, (double)thi->Ly, (double)thi->Lz, (double)(rho * grav * 1e3 * units->meter), (double)driving));
542       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Large velocity 1km/a %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n", (double)u, (double)gradu, (double)eta, (double)(2 * eta * gradu), (double)(2 * eta * gradu / driving)));
543       THIViscosity(thi, 0.5 * PetscSqr(1e-3 * gradu), &eta, &deta);
544       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Small velocity 1m/a  %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n", (double)(1e-3 * u), (double)(1e-3 * gradu), (double)eta, (double)(2 * eta * 1e-3 * gradu), (double)(2 * eta * 1e-3 * gradu / driving)));
545     }
546   }
547 
548   *inthi = thi;
549   PetscFunctionReturn(0);
550 }
551 
552 static PetscErrorCode THIInitializePrm(THI thi, DM da2prm, Vec prm) {
553   PrmNode **p;
554   PetscInt  i, j, xs, xm, ys, ym, mx, my;
555 
556   PetscFunctionBeginUser;
557   PetscCall(DMDAGetGhostCorners(da2prm, &ys, &xs, 0, &ym, &xm, 0));
558   PetscCall(DMDAGetInfo(da2prm, 0, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
559   PetscCall(DMDAVecGetArray(da2prm, prm, &p));
560   for (i = xs; i < xs + xm; i++) {
561     for (j = ys; j < ys + ym; j++) {
562       PetscReal xx = thi->Lx * i / mx, yy = thi->Ly * j / my;
563       thi->initialize(thi, xx, yy, &p[i][j]);
564     }
565   }
566   PetscCall(DMDAVecRestoreArray(da2prm, prm, &p));
567   PetscFunctionReturn(0);
568 }
569 
570 static PetscErrorCode THISetUpDM(THI thi, DM dm) {
571   PetscInt        refinelevel, coarsenlevel, level, dim, Mx, My, Mz, mx, my, s;
572   DMDAStencilType st;
573   DM              da2prm;
574   Vec             X;
575 
576   PetscFunctionBeginUser;
577   PetscCall(DMDAGetInfo(dm, &dim, &Mz, &My, &Mx, 0, &my, &mx, 0, &s, 0, 0, 0, &st));
578   if (dim == 2) PetscCall(DMDAGetInfo(dm, &dim, &My, &Mx, 0, &my, &mx, 0, 0, &s, 0, 0, 0, &st));
579   PetscCall(DMGetRefineLevel(dm, &refinelevel));
580   PetscCall(DMGetCoarsenLevel(dm, &coarsenlevel));
581   level = refinelevel - coarsenlevel;
582   PetscCall(DMDACreate2d(PetscObjectComm((PetscObject)thi), DM_BOUNDARY_PERIODIC, DM_BOUNDARY_PERIODIC, st, My, Mx, my, mx, sizeof(PrmNode) / sizeof(PetscScalar), s, 0, 0, &da2prm));
583   PetscCall(DMSetUp(da2prm));
584   PetscCall(DMCreateLocalVector(da2prm, &X));
585   {
586     PetscReal Lx = thi->Lx / thi->units->meter, Ly = thi->Ly / thi->units->meter, Lz = thi->Lz / thi->units->meter;
587     if (dim == 2) {
588       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Level %" PetscInt_FMT " domain size (m) %8.2g x %8.2g, num elements %" PetscInt_FMT " x %" PetscInt_FMT " (%" PetscInt_FMT "), size (m) %g x %g\n", level, (double)Lx, (double)Ly, Mx, My, Mx * My, (double)(Lx / Mx), (double)(Ly / My)));
589     } else {
590       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Level %" PetscInt_FMT " domain size (m) %8.2g x %8.2g x %8.2g, num elements %" PetscInt_FMT " x %" PetscInt_FMT " x %" PetscInt_FMT " (%" PetscInt_FMT "), size (m) %g x %g x %g\n", level, (double)Lx, (double)Ly, (double)Lz, Mx, My, Mz, Mx * My * Mz, (double)(Lx / Mx), (double)(Ly / My), (double)(1000. / (Mz - 1))));
591     }
592   }
593   PetscCall(THIInitializePrm(thi, da2prm, X));
594   if (thi->tridiagonal) { /* Reset coarse Jacobian evaluation */
595     PetscCall(DMDASNESSetJacobianLocal(dm, (DMDASNESJacobian)THIJacobianLocal_3D_Full, thi));
596   }
597   if (thi->coarse2d) PetscCall(DMDASNESSetJacobianLocal(dm, (DMDASNESJacobian)THIJacobianLocal_2D, thi));
598   PetscCall(PetscObjectCompose((PetscObject)dm, "DMDA2Prm", (PetscObject)da2prm));
599   PetscCall(PetscObjectCompose((PetscObject)dm, "DMDA2Prm_Vec", (PetscObject)X));
600   PetscCall(DMDestroy(&da2prm));
601   PetscCall(VecDestroy(&X));
602   PetscFunctionReturn(0);
603 }
604 
605 static PetscErrorCode DMCoarsenHook_THI(DM dmf, DM dmc, void *ctx) {
606   THI      thi = (THI)ctx;
607   PetscInt rlevel, clevel;
608 
609   PetscFunctionBeginUser;
610   PetscCall(THISetUpDM(thi, dmc));
611   PetscCall(DMGetRefineLevel(dmc, &rlevel));
612   PetscCall(DMGetCoarsenLevel(dmc, &clevel));
613   if (rlevel - clevel == 0) PetscCall(DMSetMatType(dmc, MATAIJ));
614   PetscCall(DMCoarsenHookAdd(dmc, DMCoarsenHook_THI, NULL, thi));
615   PetscFunctionReturn(0);
616 }
617 
618 static PetscErrorCode DMRefineHook_THI(DM dmc, DM dmf, void *ctx) {
619   THI thi = (THI)ctx;
620 
621   PetscFunctionBeginUser;
622   PetscCall(THISetUpDM(thi, dmf));
623   PetscCall(DMSetMatType(dmf, thi->mattype));
624   PetscCall(DMRefineHookAdd(dmf, DMRefineHook_THI, NULL, thi));
625   /* With grid sequencing, a formerly-refined DM will later be coarsened by PCSetUp_MG */
626   PetscCall(DMCoarsenHookAdd(dmf, DMCoarsenHook_THI, NULL, thi));
627   PetscFunctionReturn(0);
628 }
629 
630 static PetscErrorCode THIDAGetPrm(DM da, PrmNode ***prm) {
631   DM  da2prm;
632   Vec X;
633 
634   PetscFunctionBeginUser;
635   PetscCall(PetscObjectQuery((PetscObject)da, "DMDA2Prm", (PetscObject *)&da2prm));
636   PetscCheck(da2prm, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "No DMDA2Prm composed with given DMDA");
637   PetscCall(PetscObjectQuery((PetscObject)da, "DMDA2Prm_Vec", (PetscObject *)&X));
638   PetscCheck(X, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "No DMDA2Prm_Vec composed with given DMDA");
639   PetscCall(DMDAVecGetArray(da2prm, X, prm));
640   PetscFunctionReturn(0);
641 }
642 
643 static PetscErrorCode THIDARestorePrm(DM da, PrmNode ***prm) {
644   DM  da2prm;
645   Vec X;
646 
647   PetscFunctionBeginUser;
648   PetscCall(PetscObjectQuery((PetscObject)da, "DMDA2Prm", (PetscObject *)&da2prm));
649   PetscCheck(da2prm, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "No DMDA2Prm composed with given DMDA");
650   PetscCall(PetscObjectQuery((PetscObject)da, "DMDA2Prm_Vec", (PetscObject *)&X));
651   PetscCheck(X, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "No DMDA2Prm_Vec composed with given DMDA");
652   PetscCall(DMDAVecRestoreArray(da2prm, X, prm));
653   PetscFunctionReturn(0);
654 }
655 
656 static PetscErrorCode THIInitial(SNES snes, Vec X, void *ctx) {
657   THI       thi;
658   PetscInt  i, j, k, xs, xm, ys, ym, zs, zm, mx, my;
659   PetscReal hx, hy;
660   PrmNode **prm;
661   Node   ***x;
662   DM        da;
663 
664   PetscFunctionBeginUser;
665   PetscCall(SNESGetDM(snes, &da));
666   PetscCall(DMGetApplicationContext(da, &thi));
667   PetscCall(DMDAGetInfo(da, 0, 0, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0));
668   PetscCall(DMDAGetCorners(da, &zs, &ys, &xs, &zm, &ym, &xm));
669   PetscCall(DMDAVecGetArray(da, X, &x));
670   PetscCall(THIDAGetPrm(da, &prm));
671   hx = thi->Lx / mx;
672   hy = thi->Ly / my;
673   for (i = xs; i < xs + xm; i++) {
674     for (j = ys; j < ys + ym; j++) {
675       for (k = zs; k < zs + zm; k++) {
676         const PetscScalar zm1 = zm - 1, drivingx = thi->rhog * (prm[i + 1][j].b + prm[i + 1][j].h - prm[i - 1][j].b - prm[i - 1][j].h) / (2 * hx), drivingy = thi->rhog * (prm[i][j + 1].b + prm[i][j + 1].h - prm[i][j - 1].b - prm[i][j - 1].h) / (2 * hy);
677         x[i][j][k].u = 0. * drivingx * prm[i][j].h * (PetscScalar)k / zm1;
678         x[i][j][k].v = 0. * drivingy * prm[i][j].h * (PetscScalar)k / zm1;
679       }
680     }
681   }
682   PetscCall(DMDAVecRestoreArray(da, X, &x));
683   PetscCall(THIDARestorePrm(da, &prm));
684   PetscFunctionReturn(0);
685 }
686 
687 static void PointwiseNonlinearity(THI thi, const Node n[PETSC_RESTRICT], const PetscReal phi[PETSC_RESTRICT], PetscReal dphi[PETSC_RESTRICT][3], PetscScalar *PETSC_RESTRICT u, PetscScalar *PETSC_RESTRICT v, PetscScalar du[PETSC_RESTRICT], PetscScalar dv[PETSC_RESTRICT], PetscReal *eta, PetscReal *deta) {
688   PetscInt    l, ll;
689   PetscScalar gam;
690 
691   du[0] = du[1] = du[2] = 0;
692   dv[0] = dv[1] = dv[2] = 0;
693   *u                    = 0;
694   *v                    = 0;
695   for (l = 0; l < 8; l++) {
696     *u += phi[l] * n[l].u;
697     *v += phi[l] * n[l].v;
698     for (ll = 0; ll < 3; ll++) {
699       du[ll] += dphi[l][ll] * n[l].u;
700       dv[ll] += dphi[l][ll] * n[l].v;
701     }
702   }
703   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0] * dv[1] + 0.25 * PetscSqr(du[1] + dv[0]) + 0.25 * PetscSqr(du[2]) + 0.25 * PetscSqr(dv[2]);
704   THIViscosity(thi, PetscRealPart(gam), eta, deta);
705 }
706 
707 static void PointwiseNonlinearity2D(THI thi, Node n[], PetscReal phi[], PetscReal dphi[4][2], PetscScalar *u, PetscScalar *v, PetscScalar du[], PetscScalar dv[], PetscReal *eta, PetscReal *deta) {
708   PetscInt    l, ll;
709   PetscScalar gam;
710 
711   du[0] = du[1] = 0;
712   dv[0] = dv[1] = 0;
713   *u            = 0;
714   *v            = 0;
715   for (l = 0; l < 4; l++) {
716     *u += phi[l] * n[l].u;
717     *v += phi[l] * n[l].v;
718     for (ll = 0; ll < 2; ll++) {
719       du[ll] += dphi[l][ll] * n[l].u;
720       dv[ll] += dphi[l][ll] * n[l].v;
721     }
722   }
723   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0] * dv[1] + 0.25 * PetscSqr(du[1] + dv[0]);
724   THIViscosity(thi, PetscRealPart(gam), eta, deta);
725 }
726 
727 static PetscErrorCode THIFunctionLocal(DMDALocalInfo *info, Node ***x, Node ***f, THI thi) {
728   PetscInt  xs, ys, xm, ym, zm, i, j, k, q, l;
729   PetscReal hx, hy, etamin, etamax, beta2min, beta2max;
730   PrmNode **prm;
731 
732   PetscFunctionBeginUser;
733   xs = info->zs;
734   ys = info->ys;
735   xm = info->zm;
736   ym = info->ym;
737   zm = info->xm;
738   hx = thi->Lx / info->mz;
739   hy = thi->Ly / info->my;
740 
741   etamin   = 1e100;
742   etamax   = 0;
743   beta2min = 1e100;
744   beta2max = 0;
745 
746   PetscCall(THIDAGetPrm(info->da, &prm));
747 
748   for (i = xs; i < xs + xm; i++) {
749     for (j = ys; j < ys + ym; j++) {
750       PrmNode pn[4];
751       QuadExtract(prm, i, j, pn);
752       for (k = 0; k < zm - 1; k++) {
753         PetscInt  ls = 0;
754         Node      n[8], *fn[8];
755         PetscReal zn[8], etabase = 0;
756         PrmHexGetZ(pn, k, zm, zn);
757         HexExtract(x, i, j, k, n);
758         HexExtractRef(f, i, j, k, fn);
759         if (thi->no_slip && k == 0) {
760           for (l = 0; l < 4; l++) n[l].u = n[l].v = 0;
761           /* The first 4 basis functions lie on the bottom layer, so their contribution is exactly 0, hence we can skip them */
762           ls = 4;
763         }
764         for (q = 0; q < 8; q++) {
765           PetscReal   dz[3], phi[8], dphi[8][3], jw, eta, deta;
766           PetscScalar du[3], dv[3], u, v;
767           HexGrad(HexQDeriv[q], zn, dz);
768           HexComputeGeometry(q, hx, hy, dz, phi, dphi, &jw);
769           PointwiseNonlinearity(thi, n, phi, dphi, &u, &v, du, dv, &eta, &deta);
770           jw /= thi->rhog; /* scales residuals to be O(1) */
771           if (q == 0) etabase = eta;
772           RangeUpdate(&etamin, &etamax, eta);
773           for (l = ls; l < 8; l++) { /* test functions */
774             const PetscReal ds[2] = {-PetscSinReal(thi->alpha), 0};
775             const PetscReal pp = phi[l], *dp = dphi[l];
776             fn[l]->u += dp[0] * jw * eta * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * eta * (du[1] + dv[0]) + dp[2] * jw * eta * du[2] + pp * jw * thi->rhog * ds[0];
777             fn[l]->v += dp[1] * jw * eta * (2. * du[0] + 4. * dv[1]) + dp[0] * jw * eta * (du[1] + dv[0]) + dp[2] * jw * eta * dv[2] + pp * jw * thi->rhog * ds[1];
778           }
779         }
780         if (k == 0) { /* we are on a bottom face */
781           if (thi->no_slip) {
782             /* Note: Non-Galerkin coarse grid operators are very sensitive to the scaling of Dirichlet boundary
783             * conditions.  After shenanigans above, etabase contains the effective viscosity at the closest quadrature
784             * point to the bed.  We want the diagonal entry in the Dirichlet condition to have similar magnitude to the
785             * diagonal entry corresponding to the adjacent node.  The fundamental scaling of the viscous part is in
786             * diagu, diagv below.  This scaling is easy to recognize by considering the finite difference operator after
787             * scaling by element size.  The no-slip Dirichlet condition is scaled by this factor, and also in the
788             * assembled matrix (see the similar block in THIJacobianLocal).
789             *
790             * Note that the residual at this Dirichlet node is linear in the state at this node, but also depends
791             * (nonlinearly in general) on the neighboring interior nodes through the local viscosity.  This will make
792             * a matrix-free Jacobian have extra entries in the corresponding row.  We assemble only the diagonal part,
793             * so the solution will exactly satisfy the boundary condition after the first linear iteration.
794             */
795             const PetscReal   hz    = PetscRealPart(pn[0].h) / (zm - 1.);
796             const PetscScalar diagu = 2 * etabase / thi->rhog * (hx * hy / hz + hx * hz / hy + 4 * hy * hz / hx), diagv = 2 * etabase / thi->rhog * (hx * hy / hz + 4 * hx * hz / hy + hy * hz / hx);
797             fn[0]->u = thi->dirichlet_scale * diagu * x[i][j][k].u;
798             fn[0]->v = thi->dirichlet_scale * diagv * x[i][j][k].v;
799           } else { /* Integrate over bottom face to apply boundary condition */
800             for (q = 0; q < 4; q++) {
801               const PetscReal jw = 0.25 * hx * hy / thi->rhog, *phi = QuadQInterp[q];
802               PetscScalar     u = 0, v = 0, rbeta2 = 0;
803               PetscReal       beta2, dbeta2;
804               for (l = 0; l < 4; l++) {
805                 u += phi[l] * n[l].u;
806                 v += phi[l] * n[l].v;
807                 rbeta2 += phi[l] * pn[l].beta2;
808               }
809               THIFriction(thi, PetscRealPart(rbeta2), PetscRealPart(u * u + v * v) / 2, &beta2, &dbeta2);
810               RangeUpdate(&beta2min, &beta2max, beta2);
811               for (l = 0; l < 4; l++) {
812                 const PetscReal pp = phi[l];
813                 fn[ls + l]->u += pp * jw * beta2 * u;
814                 fn[ls + l]->v += pp * jw * beta2 * v;
815               }
816             }
817           }
818         }
819       }
820     }
821   }
822 
823   PetscCall(THIDARestorePrm(info->da, &prm));
824 
825   PetscCall(PRangeMinMax(&thi->eta, etamin, etamax));
826   PetscCall(PRangeMinMax(&thi->beta2, beta2min, beta2max));
827   PetscFunctionReturn(0);
828 }
829 
830 static PetscErrorCode THIMatrixStatistics(THI thi, Mat B, PetscViewer viewer) {
831   PetscReal   nrm;
832   PetscInt    m;
833   PetscMPIInt rank;
834 
835   PetscFunctionBeginUser;
836   PetscCall(MatNorm(B, NORM_FROBENIUS, &nrm));
837   PetscCall(MatGetSize(B, &m, 0));
838   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &rank));
839   if (rank == 0) {
840     PetscScalar val0, val2;
841     PetscCall(MatGetValue(B, 0, 0, &val0));
842     PetscCall(MatGetValue(B, 2, 2, &val2));
843     PetscCall(PetscViewerASCIIPrintf(viewer, "Matrix dim %" PetscInt_FMT " norm %8.2e (0,0) %8.2e  (2,2) %8.2e %8.2e <= eta <= %8.2e %8.2e <= beta2 <= %8.2e\n", m, (double)nrm, (double)PetscRealPart(val0), (double)PetscRealPart(val2),
844                                      (double)thi->eta.cmin, (double)thi->eta.cmax, (double)thi->beta2.cmin, (double)thi->beta2.cmax));
845   }
846   PetscFunctionReturn(0);
847 }
848 
849 static PetscErrorCode THISurfaceStatistics(DM da, Vec X, PetscReal *min, PetscReal *max, PetscReal *mean) {
850   Node     ***x;
851   PetscInt    i, j, xs, ys, zs, xm, ym, zm, mx, my, mz;
852   PetscReal   umin = 1e100, umax = -1e100;
853   PetscScalar usum = 0.0, gusum;
854 
855   PetscFunctionBeginUser;
856   *min = *max = *mean = 0;
857   PetscCall(DMDAGetInfo(da, 0, &mz, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0));
858   PetscCall(DMDAGetCorners(da, &zs, &ys, &xs, &zm, &ym, &xm));
859   PetscCheck(zs == 0 && zm == mz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected decomposition");
860   PetscCall(DMDAVecGetArray(da, X, &x));
861   for (i = xs; i < xs + xm; i++) {
862     for (j = ys; j < ys + ym; j++) {
863       PetscReal u = PetscRealPart(x[i][j][zm - 1].u);
864       RangeUpdate(&umin, &umax, u);
865       usum += u;
866     }
867   }
868   PetscCall(DMDAVecRestoreArray(da, X, &x));
869   PetscCallMPI(MPI_Allreduce(&umin, min, 1, MPIU_REAL, MPIU_MIN, PetscObjectComm((PetscObject)da)));
870   PetscCallMPI(MPI_Allreduce(&umax, max, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)da)));
871   PetscCallMPI(MPI_Allreduce(&usum, &gusum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)da)));
872   *mean = PetscRealPart(gusum) / (mx * my);
873   PetscFunctionReturn(0);
874 }
875 
876 static PetscErrorCode THISolveStatistics(THI thi, SNES snes, PetscInt coarsened, const char name[]) {
877   MPI_Comm comm;
878   Vec      X;
879   DM       dm;
880 
881   PetscFunctionBeginUser;
882   PetscCall(PetscObjectGetComm((PetscObject)thi, &comm));
883   PetscCall(SNESGetSolution(snes, &X));
884   PetscCall(SNESGetDM(snes, &dm));
885   PetscCall(PetscPrintf(comm, "Solution statistics after solve: %s\n", name));
886   {
887     PetscInt            its, lits;
888     SNESConvergedReason reason;
889     PetscCall(SNESGetIterationNumber(snes, &its));
890     PetscCall(SNESGetConvergedReason(snes, &reason));
891     PetscCall(SNESGetLinearSolveIterations(snes, &lits));
892     PetscCall(PetscPrintf(comm, "%s: Number of SNES iterations = %" PetscInt_FMT ", total linear iterations = %" PetscInt_FMT "\n", SNESConvergedReasons[reason], its, lits));
893   }
894   {
895     PetscReal          nrm2, tmin[3] = {1e100, 1e100, 1e100}, tmax[3] = {-1e100, -1e100, -1e100}, min[3], max[3];
896     PetscInt           i, j, m;
897     const PetscScalar *x;
898     PetscCall(VecNorm(X, NORM_2, &nrm2));
899     PetscCall(VecGetLocalSize(X, &m));
900     PetscCall(VecGetArrayRead(X, &x));
901     for (i = 0; i < m; i += 2) {
902       PetscReal u = PetscRealPart(x[i]), v = PetscRealPart(x[i + 1]), c = PetscSqrtReal(u * u + v * v);
903       tmin[0] = PetscMin(u, tmin[0]);
904       tmin[1] = PetscMin(v, tmin[1]);
905       tmin[2] = PetscMin(c, tmin[2]);
906       tmax[0] = PetscMax(u, tmax[0]);
907       tmax[1] = PetscMax(v, tmax[1]);
908       tmax[2] = PetscMax(c, tmax[2]);
909     }
910     PetscCall(VecRestoreArrayRead(X, &x));
911     PetscCallMPI(MPI_Allreduce(tmin, min, 3, MPIU_REAL, MPIU_MIN, PetscObjectComm((PetscObject)thi)));
912     PetscCallMPI(MPI_Allreduce(tmax, max, 3, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)thi)));
913     /* Dimensionalize to meters/year */
914     nrm2 *= thi->units->year / thi->units->meter;
915     for (j = 0; j < 3; j++) {
916       min[j] *= thi->units->year / thi->units->meter;
917       max[j] *= thi->units->year / thi->units->meter;
918     }
919     if (min[0] == 0.0) min[0] = 0.0;
920     PetscCall(PetscPrintf(comm, "|X|_2 %g   %g <= u <=  %g   %g <= v <=  %g   %g <= c <=  %g \n", (double)nrm2, (double)min[0], (double)max[0], (double)min[1], (double)max[1], (double)min[2], (double)max[2]));
921     {
922       PetscReal umin, umax, umean;
923       PetscCall(THISurfaceStatistics(dm, X, &umin, &umax, &umean));
924       umin *= thi->units->year / thi->units->meter;
925       umax *= thi->units->year / thi->units->meter;
926       umean *= thi->units->year / thi->units->meter;
927       PetscCall(PetscPrintf(comm, "Surface statistics: u in [%12.6e, %12.6e] mean %12.6e\n", (double)umin, (double)umax, (double)umean));
928     }
929     /* These values stay nondimensional */
930     PetscCall(PetscPrintf(comm, "Global eta range   %g to %g converged range %g to %g\n", (double)thi->eta.min, (double)thi->eta.max, (double)thi->eta.cmin, (double)thi->eta.cmax));
931     PetscCall(PetscPrintf(comm, "Global beta2 range %g to %g converged range %g to %g\n", (double)thi->beta2.min, (double)thi->beta2.max, (double)thi->beta2.cmin, (double)thi->beta2.cmax));
932   }
933   PetscFunctionReturn(0);
934 }
935 
936 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo *info, Node **x, Mat J, Mat B, THI thi) {
937   PetscInt  xs, ys, xm, ym, i, j, q, l, ll;
938   PetscReal hx, hy;
939   PrmNode **prm;
940 
941   PetscFunctionBeginUser;
942   xs = info->ys;
943   ys = info->xs;
944   xm = info->ym;
945   ym = info->xm;
946   hx = thi->Lx / info->my;
947   hy = thi->Ly / info->mx;
948 
949   PetscCall(MatZeroEntries(B));
950   PetscCall(THIDAGetPrm(info->da, &prm));
951 
952   for (i = xs; i < xs + xm; i++) {
953     for (j = ys; j < ys + ym; j++) {
954       Node        n[4];
955       PrmNode     pn[4];
956       PetscScalar Ke[4 * 2][4 * 2];
957       QuadExtract(prm, i, j, pn);
958       QuadExtract(x, i, j, n);
959       PetscCall(PetscMemzero(Ke, sizeof(Ke)));
960       for (q = 0; q < 4; q++) {
961         PetscReal   phi[4], dphi[4][2], jw, eta, deta, beta2, dbeta2;
962         PetscScalar u, v, du[2], dv[2], h = 0, rbeta2 = 0;
963         for (l = 0; l < 4; l++) {
964           phi[l]     = QuadQInterp[q][l];
965           dphi[l][0] = QuadQDeriv[q][l][0] * 2. / hx;
966           dphi[l][1] = QuadQDeriv[q][l][1] * 2. / hy;
967           h += phi[l] * pn[l].h;
968           rbeta2 += phi[l] * pn[l].beta2;
969         }
970         jw = 0.25 * hx * hy / thi->rhog; /* rhog is only scaling */
971         PointwiseNonlinearity2D(thi, n, phi, dphi, &u, &v, du, dv, &eta, &deta);
972         THIFriction(thi, PetscRealPart(rbeta2), PetscRealPart(u * u + v * v) / 2, &beta2, &dbeta2);
973         for (l = 0; l < 4; l++) {
974           const PetscReal pp = phi[l], *dp = dphi[l];
975           for (ll = 0; ll < 4; ll++) {
976             const PetscReal ppl = phi[ll], *dpl = dphi[ll];
977             PetscScalar     dgdu, dgdv;
978             dgdu = 2. * du[0] * dpl[0] + dv[1] * dpl[0] + 0.5 * (du[1] + dv[0]) * dpl[1];
979             dgdv = 2. * dv[1] * dpl[1] + du[0] * dpl[1] + 0.5 * (du[1] + dv[0]) * dpl[0];
980             /* Picard part */
981             Ke[l * 2 + 0][ll * 2 + 0] += dp[0] * jw * eta * 4. * dpl[0] + dp[1] * jw * eta * dpl[1] + pp * jw * (beta2 / h) * ppl * thi->ssa_friction_scale;
982             Ke[l * 2 + 0][ll * 2 + 1] += dp[0] * jw * eta * 2. * dpl[1] + dp[1] * jw * eta * dpl[0];
983             Ke[l * 2 + 1][ll * 2 + 0] += dp[1] * jw * eta * 2. * dpl[0] + dp[0] * jw * eta * dpl[1];
984             Ke[l * 2 + 1][ll * 2 + 1] += dp[1] * jw * eta * 4. * dpl[1] + dp[0] * jw * eta * dpl[0] + pp * jw * (beta2 / h) * ppl * thi->ssa_friction_scale;
985             /* extra Newton terms */
986             Ke[l * 2 + 0][ll * 2 + 0] += dp[0] * jw * deta * dgdu * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * deta * dgdu * (du[1] + dv[0]) + pp * jw * (dbeta2 / h) * u * u * ppl * thi->ssa_friction_scale;
987             Ke[l * 2 + 0][ll * 2 + 1] += dp[0] * jw * deta * dgdv * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * deta * dgdv * (du[1] + dv[0]) + pp * jw * (dbeta2 / h) * u * v * ppl * thi->ssa_friction_scale;
988             Ke[l * 2 + 1][ll * 2 + 0] += dp[1] * jw * deta * dgdu * (4. * dv[1] + 2. * du[0]) + dp[0] * jw * deta * dgdu * (du[1] + dv[0]) + pp * jw * (dbeta2 / h) * v * u * ppl * thi->ssa_friction_scale;
989             Ke[l * 2 + 1][ll * 2 + 1] += dp[1] * jw * deta * dgdv * (4. * dv[1] + 2. * du[0]) + dp[0] * jw * deta * dgdv * (du[1] + dv[0]) + pp * jw * (dbeta2 / h) * v * v * ppl * thi->ssa_friction_scale;
990           }
991         }
992       }
993       {
994         const MatStencil rc[4] = {
995           {0, i,     j,     0},
996           {0, i + 1, j,     0},
997           {0, i + 1, j + 1, 0},
998           {0, i,     j + 1, 0}
999         };
1000         PetscCall(MatSetValuesBlockedStencil(B, 4, rc, 4, rc, &Ke[0][0], ADD_VALUES));
1001       }
1002     }
1003   }
1004   PetscCall(THIDARestorePrm(info->da, &prm));
1005 
1006   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1007   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1008   PetscCall(MatSetOption(B, MAT_SYMMETRIC, PETSC_TRUE));
1009   if (thi->verbose) PetscCall(THIMatrixStatistics(thi, B, PETSC_VIEWER_STDOUT_WORLD));
1010   PetscFunctionReturn(0);
1011 }
1012 
1013 static PetscErrorCode THIJacobianLocal_3D(DMDALocalInfo *info, Node ***x, Mat B, THI thi, THIAssemblyMode amode) {
1014   PetscInt  xs, ys, xm, ym, zm, i, j, k, q, l, ll;
1015   PetscReal hx, hy;
1016   PrmNode **prm;
1017 
1018   PetscFunctionBeginUser;
1019   xs = info->zs;
1020   ys = info->ys;
1021   xm = info->zm;
1022   ym = info->ym;
1023   zm = info->xm;
1024   hx = thi->Lx / info->mz;
1025   hy = thi->Ly / info->my;
1026 
1027   PetscCall(MatZeroEntries(B));
1028   PetscCall(MatSetOption(B, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE));
1029   PetscCall(THIDAGetPrm(info->da, &prm));
1030 
1031   for (i = xs; i < xs + xm; i++) {
1032     for (j = ys; j < ys + ym; j++) {
1033       PrmNode pn[4];
1034       QuadExtract(prm, i, j, pn);
1035       for (k = 0; k < zm - 1; k++) {
1036         Node        n[8];
1037         PetscReal   zn[8], etabase = 0;
1038         PetscScalar Ke[8 * 2][8 * 2];
1039         PetscInt    ls = 0;
1040 
1041         PrmHexGetZ(pn, k, zm, zn);
1042         HexExtract(x, i, j, k, n);
1043         PetscCall(PetscMemzero(Ke, sizeof(Ke)));
1044         if (thi->no_slip && k == 0) {
1045           for (l = 0; l < 4; l++) n[l].u = n[l].v = 0;
1046           ls = 4;
1047         }
1048         for (q = 0; q < 8; q++) {
1049           PetscReal   dz[3], phi[8], dphi[8][3], jw, eta, deta;
1050           PetscScalar du[3], dv[3], u, v;
1051           HexGrad(HexQDeriv[q], zn, dz);
1052           HexComputeGeometry(q, hx, hy, dz, phi, dphi, &jw);
1053           PointwiseNonlinearity(thi, n, phi, dphi, &u, &v, du, dv, &eta, &deta);
1054           jw /= thi->rhog; /* residuals are scaled by this factor */
1055           if (q == 0) etabase = eta;
1056           for (l = ls; l < 8; l++) { /* test functions */
1057             const PetscReal *PETSC_RESTRICT dp = dphi[l];
1058 #if USE_SSE2_KERNELS
1059             /* gcc (up to my 4.5 snapshot) is really bad at hoisting intrinsics so we do it manually */
1060             __m128d p4 = _mm_set1_pd(4), p2 = _mm_set1_pd(2), p05 = _mm_set1_pd(0.5), p42 = _mm_setr_pd(4, 2), p24 = _mm_shuffle_pd(p42, p42, _MM_SHUFFLE2(0, 1)), du0 = _mm_set1_pd(du[0]), du1 = _mm_set1_pd(du[1]), du2 = _mm_set1_pd(du[2]), dv0 = _mm_set1_pd(dv[0]), dv1 = _mm_set1_pd(dv[1]), dv2 = _mm_set1_pd(dv[2]), jweta = _mm_set1_pd(jw * eta), jwdeta = _mm_set1_pd(jw * deta), dp0 = _mm_set1_pd(dp[0]), dp1 = _mm_set1_pd(dp[1]), dp2 = _mm_set1_pd(dp[2]), dp0jweta = _mm_mul_pd(dp0, jweta), dp1jweta = _mm_mul_pd(dp1, jweta), dp2jweta = _mm_mul_pd(dp2, jweta), p4du0p2dv1 = _mm_add_pd(_mm_mul_pd(p4, du0), _mm_mul_pd(p2, dv1)), /* 4 du0 + 2 dv1 */
1061               p4dv1p2du0 = _mm_add_pd(_mm_mul_pd(p4, dv1), _mm_mul_pd(p2, du0)), /* 4 dv1 + 2 du0 */
1062               pdu2dv2    = _mm_unpacklo_pd(du2, dv2),                            /* [du2, dv2] */
1063               du1pdv0    = _mm_add_pd(du1, dv0),                                 /* du1 + dv0 */
1064               t1         = _mm_mul_pd(dp0, p4du0p2dv1),                          /* dp0 (4 du0 + 2 dv1) */
1065               t2         = _mm_mul_pd(dp1, p4dv1p2du0);                          /* dp1 (4 dv1 + 2 du0) */
1066 
1067 #endif
1068 #if defined COMPUTE_LOWER_TRIANGULAR      /* The element matrices are always symmetric so computing the lower-triangular part is not necessary */
1069             for (ll = ls; ll < 8; ll++) { /* trial functions */
1070 #else
1071             for (ll = l; ll < 8; ll++) {
1072 #endif
1073               const PetscReal *PETSC_RESTRICT dpl = dphi[ll];
1074               if (amode == THIASSEMBLY_TRIDIAGONAL && (l - ll) % 4) continue; /* these entries would not be inserted */
1075 #if !USE_SSE2_KERNELS
1076               /* The analytic Jacobian in nice, easy-to-read form */
1077               {
1078                 PetscScalar dgdu, dgdv;
1079                 dgdu = 2. * du[0] * dpl[0] + dv[1] * dpl[0] + 0.5 * (du[1] + dv[0]) * dpl[1] + 0.5 * du[2] * dpl[2];
1080                 dgdv = 2. * dv[1] * dpl[1] + du[0] * dpl[1] + 0.5 * (du[1] + dv[0]) * dpl[0] + 0.5 * dv[2] * dpl[2];
1081                 /* Picard part */
1082                 Ke[l * 2 + 0][ll * 2 + 0] += dp[0] * jw * eta * 4. * dpl[0] + dp[1] * jw * eta * dpl[1] + dp[2] * jw * eta * dpl[2];
1083                 Ke[l * 2 + 0][ll * 2 + 1] += dp[0] * jw * eta * 2. * dpl[1] + dp[1] * jw * eta * dpl[0];
1084                 Ke[l * 2 + 1][ll * 2 + 0] += dp[1] * jw * eta * 2. * dpl[0] + dp[0] * jw * eta * dpl[1];
1085                 Ke[l * 2 + 1][ll * 2 + 1] += dp[1] * jw * eta * 4. * dpl[1] + dp[0] * jw * eta * dpl[0] + dp[2] * jw * eta * dpl[2];
1086                 /* extra Newton terms */
1087                 Ke[l * 2 + 0][ll * 2 + 0] += dp[0] * jw * deta * dgdu * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * deta * dgdu * (du[1] + dv[0]) + dp[2] * jw * deta * dgdu * du[2];
1088                 Ke[l * 2 + 0][ll * 2 + 1] += dp[0] * jw * deta * dgdv * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * deta * dgdv * (du[1] + dv[0]) + dp[2] * jw * deta * dgdv * du[2];
1089                 Ke[l * 2 + 1][ll * 2 + 0] += dp[1] * jw * deta * dgdu * (4. * dv[1] + 2. * du[0]) + dp[0] * jw * deta * dgdu * (du[1] + dv[0]) + dp[2] * jw * deta * dgdu * dv[2];
1090                 Ke[l * 2 + 1][ll * 2 + 1] += dp[1] * jw * deta * dgdv * (4. * dv[1] + 2. * du[0]) + dp[0] * jw * deta * dgdv * (du[1] + dv[0]) + dp[2] * jw * deta * dgdv * dv[2];
1091               }
1092 #else
1093               /* This SSE2 code is an exact replica of above, but uses explicit packed instructions for some speed
1094               * benefit.  On my hardware, these intrinsics are almost twice as fast as above, reducing total assembly cost
1095               * by 25 to 30 percent. */
1096               {
1097                 __m128d keu = _mm_loadu_pd(&Ke[l * 2 + 0][ll * 2 + 0]), kev = _mm_loadu_pd(&Ke[l * 2 + 1][ll * 2 + 0]), dpl01 = _mm_loadu_pd(&dpl[0]), dpl10 = _mm_shuffle_pd(dpl01, dpl01, _MM_SHUFFLE2(0, 1)), dpl2 = _mm_set_sd(dpl[2]), t0, t3, pdgduv;
1098                 keu    = _mm_add_pd(keu, _mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp0jweta, p42), dpl01), _mm_add_pd(_mm_mul_pd(dp1jweta, dpl10), _mm_mul_pd(dp2jweta, dpl2))));
1099                 kev    = _mm_add_pd(kev, _mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp1jweta, p24), dpl01), _mm_add_pd(_mm_mul_pd(dp0jweta, dpl10), _mm_mul_pd(dp2jweta, _mm_shuffle_pd(dpl2, dpl2, _MM_SHUFFLE2(0, 1))))));
1100                 pdgduv = _mm_mul_pd(p05, _mm_add_pd(_mm_add_pd(_mm_mul_pd(p42, _mm_mul_pd(du0, dpl01)), _mm_mul_pd(p24, _mm_mul_pd(dv1, dpl01))), _mm_add_pd(_mm_mul_pd(du1pdv0, dpl10), _mm_mul_pd(pdu2dv2, _mm_set1_pd(dpl[2]))))); /* [dgdu, dgdv] */
1101                 t0     = _mm_mul_pd(jwdeta, pdgduv); /* jw deta [dgdu, dgdv] */
1102                 t3     = _mm_mul_pd(t0, du1pdv0);    /* t0 (du1 + dv0) */
1103                 _mm_storeu_pd(&Ke[l * 2 + 0][ll * 2 + 0], _mm_add_pd(keu, _mm_add_pd(_mm_mul_pd(t1, t0), _mm_add_pd(_mm_mul_pd(dp1, t3), _mm_mul_pd(t0, _mm_mul_pd(dp2, du2))))));
1104                 _mm_storeu_pd(&Ke[l * 2 + 1][ll * 2 + 0], _mm_add_pd(kev, _mm_add_pd(_mm_mul_pd(t2, t0), _mm_add_pd(_mm_mul_pd(dp0, t3), _mm_mul_pd(t0, _mm_mul_pd(dp2, dv2))))));
1105               }
1106 #endif
1107             }
1108           }
1109         }
1110         if (k == 0) { /* on a bottom face */
1111           if (thi->no_slip) {
1112             const PetscReal   hz    = PetscRealPart(pn[0].h) / (zm - 1);
1113             const PetscScalar diagu = 2 * etabase / thi->rhog * (hx * hy / hz + hx * hz / hy + 4 * hy * hz / hx), diagv = 2 * etabase / thi->rhog * (hx * hy / hz + 4 * hx * hz / hy + hy * hz / hx);
1114             Ke[0][0] = thi->dirichlet_scale * diagu;
1115             Ke[1][1] = thi->dirichlet_scale * diagv;
1116           } else {
1117             for (q = 0; q < 4; q++) {
1118               const PetscReal jw = 0.25 * hx * hy / thi->rhog, *phi = QuadQInterp[q];
1119               PetscScalar     u = 0, v = 0, rbeta2 = 0;
1120               PetscReal       beta2, dbeta2;
1121               for (l = 0; l < 4; l++) {
1122                 u += phi[l] * n[l].u;
1123                 v += phi[l] * n[l].v;
1124                 rbeta2 += phi[l] * pn[l].beta2;
1125               }
1126               THIFriction(thi, PetscRealPart(rbeta2), PetscRealPart(u * u + v * v) / 2, &beta2, &dbeta2);
1127               for (l = 0; l < 4; l++) {
1128                 const PetscReal pp = phi[l];
1129                 for (ll = 0; ll < 4; ll++) {
1130                   const PetscReal ppl = phi[ll];
1131                   Ke[l * 2 + 0][ll * 2 + 0] += pp * jw * beta2 * ppl + pp * jw * dbeta2 * u * u * ppl;
1132                   Ke[l * 2 + 0][ll * 2 + 1] += pp * jw * dbeta2 * u * v * ppl;
1133                   Ke[l * 2 + 1][ll * 2 + 0] += pp * jw * dbeta2 * v * u * ppl;
1134                   Ke[l * 2 + 1][ll * 2 + 1] += pp * jw * beta2 * ppl + pp * jw * dbeta2 * v * v * ppl;
1135                 }
1136               }
1137             }
1138           }
1139         }
1140         {
1141           const MatStencil rc[8] = {
1142             {i,     j,     k,     0},
1143             {i + 1, j,     k,     0},
1144             {i + 1, j + 1, k,     0},
1145             {i,     j + 1, k,     0},
1146             {i,     j,     k + 1, 0},
1147             {i + 1, j,     k + 1, 0},
1148             {i + 1, j + 1, k + 1, 0},
1149             {i,     j + 1, k + 1, 0}
1150           };
1151           if (amode == THIASSEMBLY_TRIDIAGONAL) {
1152             for (l = 0; l < 4; l++) { /* Copy out each of the blocks, discarding horizontal coupling */
1153               const PetscInt   l4     = l + 4;
1154               const MatStencil rcl[2] = {
1155                 {rc[l].k,  rc[l].j,  rc[l].i,  0},
1156                 {rc[l4].k, rc[l4].j, rc[l4].i, 0}
1157               };
1158 #if defined COMPUTE_LOWER_TRIANGULAR
1159               const PetscScalar Kel[4][4] = {
1160                 {Ke[2 * l + 0][2 * l + 0],  Ke[2 * l + 0][2 * l + 1],  Ke[2 * l + 0][2 * l4 + 0],  Ke[2 * l + 0][2 * l4 + 1] },
1161                 {Ke[2 * l + 1][2 * l + 0],  Ke[2 * l + 1][2 * l + 1],  Ke[2 * l + 1][2 * l4 + 0],  Ke[2 * l + 1][2 * l4 + 1] },
1162                 {Ke[2 * l4 + 0][2 * l + 0], Ke[2 * l4 + 0][2 * l + 1], Ke[2 * l4 + 0][2 * l4 + 0], Ke[2 * l4 + 0][2 * l4 + 1]},
1163                 {Ke[2 * l4 + 1][2 * l + 0], Ke[2 * l4 + 1][2 * l + 1], Ke[2 * l4 + 1][2 * l4 + 0], Ke[2 * l4 + 1][2 * l4 + 1]}
1164               };
1165 #else
1166               /* Same as above except for the lower-left block */
1167               const PetscScalar Kel[4][4] = {
1168                 {Ke[2 * l + 0][2 * l + 0],  Ke[2 * l + 0][2 * l + 1],  Ke[2 * l + 0][2 * l4 + 0],  Ke[2 * l + 0][2 * l4 + 1] },
1169                 {Ke[2 * l + 1][2 * l + 0],  Ke[2 * l + 1][2 * l + 1],  Ke[2 * l + 1][2 * l4 + 0],  Ke[2 * l + 1][2 * l4 + 1] },
1170                 {Ke[2 * l + 0][2 * l4 + 0], Ke[2 * l + 1][2 * l4 + 0], Ke[2 * l4 + 0][2 * l4 + 0], Ke[2 * l4 + 0][2 * l4 + 1]},
1171                 {Ke[2 * l + 0][2 * l4 + 1], Ke[2 * l + 1][2 * l4 + 1], Ke[2 * l4 + 1][2 * l4 + 0], Ke[2 * l4 + 1][2 * l4 + 1]}
1172               };
1173 #endif
1174               PetscCall(MatSetValuesBlockedStencil(B, 2, rcl, 2, rcl, &Kel[0][0], ADD_VALUES));
1175             }
1176           } else {
1177 #if !defined COMPUTE_LOWER_TRIANGULAR /* fill in lower-triangular part, this is really cheap compared to computing the entries */
1178             for (l = 0; l < 8; l++) {
1179               for (ll = l + 1; ll < 8; ll++) {
1180                 Ke[ll * 2 + 0][l * 2 + 0] = Ke[l * 2 + 0][ll * 2 + 0];
1181                 Ke[ll * 2 + 1][l * 2 + 0] = Ke[l * 2 + 0][ll * 2 + 1];
1182                 Ke[ll * 2 + 0][l * 2 + 1] = Ke[l * 2 + 1][ll * 2 + 0];
1183                 Ke[ll * 2 + 1][l * 2 + 1] = Ke[l * 2 + 1][ll * 2 + 1];
1184               }
1185             }
1186 #endif
1187             PetscCall(MatSetValuesBlockedStencil(B, 8, rc, 8, rc, &Ke[0][0], ADD_VALUES));
1188           }
1189         }
1190       }
1191     }
1192   }
1193   PetscCall(THIDARestorePrm(info->da, &prm));
1194 
1195   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1196   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1197   PetscCall(MatSetOption(B, MAT_SYMMETRIC, PETSC_TRUE));
1198   if (thi->verbose) PetscCall(THIMatrixStatistics(thi, B, PETSC_VIEWER_STDOUT_WORLD));
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo *info, Node ***x, Mat A, Mat B, THI thi) {
1203   PetscFunctionBeginUser;
1204   PetscCall(THIJacobianLocal_3D(info, x, B, thi, THIASSEMBLY_FULL));
1205   PetscFunctionReturn(0);
1206 }
1207 
1208 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo *info, Node ***x, Mat A, Mat B, THI thi) {
1209   PetscFunctionBeginUser;
1210   PetscCall(THIJacobianLocal_3D(info, x, B, thi, THIASSEMBLY_TRIDIAGONAL));
1211   PetscFunctionReturn(0);
1212 }
1213 
1214 static PetscErrorCode DMRefineHierarchy_THI(DM dac0, PetscInt nlevels, DM hierarchy[]) {
1215   THI             thi;
1216   PetscInt        dim, M, N, m, n, s, dof;
1217   DM              dac, daf;
1218   DMDAStencilType st;
1219   DM_DA          *ddf, *ddc;
1220 
1221   PetscFunctionBeginUser;
1222   PetscCall(PetscObjectQuery((PetscObject)dac0, "THI", (PetscObject *)&thi));
1223   PetscCheck(thi, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot refine this DMDA, missing composed THI instance");
1224   if (nlevels > 1) {
1225     PetscCall(DMRefineHierarchy(dac0, nlevels - 1, hierarchy));
1226     dac = hierarchy[nlevels - 2];
1227   } else {
1228     dac = dac0;
1229   }
1230   PetscCall(DMDAGetInfo(dac, &dim, &N, &M, 0, &n, &m, 0, &dof, &s, 0, 0, 0, &st));
1231   PetscCheck(dim == 2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "This function can only refine 2D DMDAs");
1232 
1233   /* Creates a 3D DMDA with the same map-plane layout as the 2D one, with contiguous columns */
1234   PetscCall(DMDACreate3d(PetscObjectComm((PetscObject)dac), DM_BOUNDARY_NONE, DM_BOUNDARY_PERIODIC, DM_BOUNDARY_PERIODIC, st, thi->zlevels, N, M, 1, n, m, dof, s, NULL, NULL, NULL, &daf));
1235   PetscCall(DMSetUp(daf));
1236 
1237   daf->ops->creatematrix        = dac->ops->creatematrix;
1238   daf->ops->createinterpolation = dac->ops->createinterpolation;
1239   daf->ops->getcoloring         = dac->ops->getcoloring;
1240   ddf                           = (DM_DA *)daf->data;
1241   ddc                           = (DM_DA *)dac->data;
1242   ddf->interptype               = ddc->interptype;
1243 
1244   PetscCall(DMDASetFieldName(daf, 0, "x-velocity"));
1245   PetscCall(DMDASetFieldName(daf, 1, "y-velocity"));
1246 
1247   hierarchy[nlevels - 1] = daf;
1248   PetscFunctionReturn(0);
1249 }
1250 
1251 static PetscErrorCode DMCreateInterpolation_DA_THI(DM dac, DM daf, Mat *A, Vec *scale) {
1252   PetscInt dim;
1253 
1254   PetscFunctionBeginUser;
1255   PetscValidHeaderSpecific(dac, DM_CLASSID, 1);
1256   PetscValidHeaderSpecific(daf, DM_CLASSID, 2);
1257   PetscValidPointer(A, 3);
1258   if (scale) PetscValidPointer(scale, 4);
1259   PetscCall(DMDAGetInfo(daf, &dim, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
1260   if (dim == 2) {
1261     /* We are in the 2D problem and use normal DMDA interpolation */
1262     PetscCall(DMCreateInterpolation(dac, daf, A, scale));
1263   } else {
1264     PetscInt i, j, k, xs, ys, zs, xm, ym, zm, mx, my, mz, rstart, cstart;
1265     Mat      B;
1266 
1267     PetscCall(DMDAGetInfo(daf, 0, &mz, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0));
1268     PetscCall(DMDAGetCorners(daf, &zs, &ys, &xs, &zm, &ym, &xm));
1269     PetscCheck(!zs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "unexpected");
1270     PetscCall(MatCreate(PetscObjectComm((PetscObject)daf), &B));
1271     PetscCall(MatSetSizes(B, xm * ym * zm, xm * ym, mx * my * mz, mx * my));
1272 
1273     PetscCall(MatSetType(B, MATAIJ));
1274     PetscCall(MatSeqAIJSetPreallocation(B, 1, NULL));
1275     PetscCall(MatMPIAIJSetPreallocation(B, 1, NULL, 0, NULL));
1276     PetscCall(MatGetOwnershipRange(B, &rstart, NULL));
1277     PetscCall(MatGetOwnershipRangeColumn(B, &cstart, NULL));
1278     for (i = xs; i < xs + xm; i++) {
1279       for (j = ys; j < ys + ym; j++) {
1280         for (k = zs; k < zs + zm; k++) {
1281           PetscInt    i2 = i * ym + j, i3 = i2 * zm + k;
1282           PetscScalar val = ((k == 0 || k == mz - 1) ? 0.5 : 1.) / (mz - 1.); /* Integration using trapezoid rule */
1283           PetscCall(MatSetValue(B, cstart + i3, rstart + i2, val, INSERT_VALUES));
1284         }
1285       }
1286     }
1287     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1288     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1289     PetscCall(MatCreateMAIJ(B, sizeof(Node) / sizeof(PetscScalar), A));
1290     PetscCall(MatDestroy(&B));
1291   }
1292   PetscFunctionReturn(0);
1293 }
1294 
1295 static PetscErrorCode DMCreateMatrix_THI_Tridiagonal(DM da, Mat *J) {
1296   Mat                    A;
1297   PetscInt               xm, ym, zm, dim, dof = 2, starts[3], dims[3];
1298   ISLocalToGlobalMapping ltog;
1299 
1300   PetscFunctionBeginUser;
1301   PetscCall(DMDAGetInfo(da, &dim, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
1302   PetscCheck(dim == 3, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Expected DMDA to be 3D");
1303   PetscCall(DMDAGetCorners(da, 0, 0, 0, &zm, &ym, &xm));
1304   PetscCall(DMGetLocalToGlobalMapping(da, &ltog));
1305   PetscCall(MatCreate(PetscObjectComm((PetscObject)da), &A));
1306   PetscCall(MatSetSizes(A, dof * xm * ym * zm, dof * xm * ym * zm, PETSC_DETERMINE, PETSC_DETERMINE));
1307   PetscCall(MatSetType(A, da->mattype));
1308   PetscCall(MatSetFromOptions(A));
1309   PetscCall(MatSeqAIJSetPreallocation(A, 3 * 2, NULL));
1310   PetscCall(MatMPIAIJSetPreallocation(A, 3 * 2, NULL, 0, NULL));
1311   PetscCall(MatSeqBAIJSetPreallocation(A, 2, 3, NULL));
1312   PetscCall(MatMPIBAIJSetPreallocation(A, 2, 3, NULL, 0, NULL));
1313   PetscCall(MatSeqSBAIJSetPreallocation(A, 2, 2, NULL));
1314   PetscCall(MatMPISBAIJSetPreallocation(A, 2, 2, NULL, 0, NULL));
1315   PetscCall(MatSetLocalToGlobalMapping(A, ltog, ltog));
1316   PetscCall(DMDAGetGhostCorners(da, &starts[0], &starts[1], &starts[2], &dims[0], &dims[1], &dims[2]));
1317   PetscCall(MatSetStencil(A, dim, dims, starts, dof));
1318   *J = A;
1319   PetscFunctionReturn(0);
1320 }
1321 
1322 static PetscErrorCode THIDAVecView_VTK_XML(THI thi, DM da, Vec X, const char filename[]) {
1323   const PetscInt     dof   = 2;
1324   Units              units = thi->units;
1325   MPI_Comm           comm;
1326   PetscViewer        viewer;
1327   PetscMPIInt        rank, size, tag, nn, nmax;
1328   PetscInt           mx, my, mz, r, range[6];
1329   const PetscScalar *x;
1330 
1331   PetscFunctionBeginUser;
1332   PetscCall(PetscObjectGetComm((PetscObject)thi, &comm));
1333   PetscCall(DMDAGetInfo(da, 0, &mz, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0));
1334   PetscCallMPI(MPI_Comm_size(comm, &size));
1335   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1336   PetscCall(PetscViewerASCIIOpen(comm, filename, &viewer));
1337   PetscCall(PetscViewerASCIIPrintf(viewer, "<VTKFile type=\"StructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n"));
1338   PetscCall(PetscViewerASCIIPrintf(viewer, "  <StructuredGrid WholeExtent=\"%d %" PetscInt_FMT " %d %" PetscInt_FMT " %d %" PetscInt_FMT "\">\n", 0, mz - 1, 0, my - 1, 0, mx - 1));
1339 
1340   PetscCall(DMDAGetCorners(da, range, range + 1, range + 2, range + 3, range + 4, range + 5));
1341   PetscCall(PetscMPIIntCast(range[3] * range[4] * range[5] * dof, &nn));
1342   PetscCallMPI(MPI_Reduce(&nn, &nmax, 1, MPI_INT, MPI_MAX, 0, comm));
1343   tag = ((PetscObject)viewer)->tag;
1344   PetscCall(VecGetArrayRead(X, &x));
1345   if (rank == 0) {
1346     PetscScalar *array;
1347     PetscCall(PetscMalloc1(nmax, &array));
1348     for (r = 0; r < size; r++) {
1349       PetscInt           i, j, k, xs, xm, ys, ym, zs, zm;
1350       const PetscScalar *ptr;
1351       MPI_Status         status;
1352       if (r) PetscCallMPI(MPI_Recv(range, 6, MPIU_INT, r, tag, comm, MPI_STATUS_IGNORE));
1353       zs = range[0];
1354       ys = range[1];
1355       xs = range[2];
1356       zm = range[3];
1357       ym = range[4];
1358       xm = range[5];
1359       PetscCheck(xm * ym * zm * dof <= nmax, PETSC_COMM_SELF, PETSC_ERR_PLIB, "should not happen");
1360       if (r) {
1361         PetscCallMPI(MPI_Recv(array, nmax, MPIU_SCALAR, r, tag, comm, &status));
1362         PetscCallMPI(MPI_Get_count(&status, MPIU_SCALAR, &nn));
1363         PetscCheck(nn == xm * ym * zm * dof, PETSC_COMM_SELF, PETSC_ERR_PLIB, "should not happen");
1364         ptr = array;
1365       } else ptr = x;
1366       PetscCall(PetscViewerASCIIPrintf(viewer, "    <Piece Extent=\"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\">\n", zs, zs + zm - 1, ys, ys + ym - 1, xs, xs + xm - 1));
1367 
1368       PetscCall(PetscViewerASCIIPrintf(viewer, "      <Points>\n"));
1369       PetscCall(PetscViewerASCIIPrintf(viewer, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1370       for (i = xs; i < xs + xm; i++) {
1371         for (j = ys; j < ys + ym; j++) {
1372           for (k = zs; k < zs + zm; k++) {
1373             PrmNode   p;
1374             PetscReal xx = thi->Lx * i / mx, yy = thi->Ly * j / my, zz;
1375             thi->initialize(thi, xx, yy, &p);
1376             zz = PetscRealPart(p.b) + PetscRealPart(p.h) * k / (mz - 1);
1377             PetscCall(PetscViewerASCIIPrintf(viewer, "%f %f %f\n", (double)xx, (double)yy, (double)zz));
1378           }
1379         }
1380       }
1381       PetscCall(PetscViewerASCIIPrintf(viewer, "        </DataArray>\n"));
1382       PetscCall(PetscViewerASCIIPrintf(viewer, "      </Points>\n"));
1383 
1384       PetscCall(PetscViewerASCIIPrintf(viewer, "      <PointData>\n"));
1385       PetscCall(PetscViewerASCIIPrintf(viewer, "        <DataArray type=\"Float32\" Name=\"velocity\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1386       for (i = 0; i < nn; i += dof) PetscCall(PetscViewerASCIIPrintf(viewer, "%f %f %f\n", (double)(PetscRealPart(ptr[i]) * units->year / units->meter), (double)(PetscRealPart(ptr[i + 1]) * units->year / units->meter), 0.0));
1387       PetscCall(PetscViewerASCIIPrintf(viewer, "        </DataArray>\n"));
1388 
1389       PetscCall(PetscViewerASCIIPrintf(viewer, "        <DataArray type=\"Int32\" Name=\"rank\" NumberOfComponents=\"1\" format=\"ascii\">\n"));
1390       for (i = 0; i < nn; i += dof) PetscCall(PetscViewerASCIIPrintf(viewer, "%" PetscInt_FMT "\n", r));
1391       PetscCall(PetscViewerASCIIPrintf(viewer, "        </DataArray>\n"));
1392       PetscCall(PetscViewerASCIIPrintf(viewer, "      </PointData>\n"));
1393 
1394       PetscCall(PetscViewerASCIIPrintf(viewer, "    </Piece>\n"));
1395     }
1396     PetscCall(PetscFree(array));
1397   } else {
1398     PetscCallMPI(MPI_Send(range, 6, MPIU_INT, 0, tag, comm));
1399     PetscCallMPI(MPI_Send((PetscScalar *)x, nn, MPIU_SCALAR, 0, tag, comm));
1400   }
1401   PetscCall(VecRestoreArrayRead(X, &x));
1402   PetscCall(PetscViewerASCIIPrintf(viewer, "  </StructuredGrid>\n"));
1403   PetscCall(PetscViewerASCIIPrintf(viewer, "</VTKFile>\n"));
1404   PetscCall(PetscViewerDestroy(&viewer));
1405   PetscFunctionReturn(0);
1406 }
1407 
1408 int main(int argc, char *argv[]) {
1409   MPI_Comm comm;
1410   THI      thi;
1411   DM       da;
1412   SNES     snes;
1413 
1414   PetscFunctionBeginUser;
1415   PetscCall(PetscInitialize(&argc, &argv, 0, help));
1416   comm = PETSC_COMM_WORLD;
1417 
1418   PetscCall(THICreate(comm, &thi));
1419   {
1420     PetscInt M = 3, N = 3, P = 2;
1421     PetscOptionsBegin(comm, NULL, "Grid resolution options", "");
1422     {
1423       PetscCall(PetscOptionsInt("-M", "Number of elements in x-direction on coarse level", "", M, &M, NULL));
1424       N = M;
1425       PetscCall(PetscOptionsInt("-N", "Number of elements in y-direction on coarse level (if different from M)", "", N, &N, NULL));
1426       if (thi->coarse2d) {
1427         PetscCall(PetscOptionsInt("-zlevels", "Number of elements in z-direction on fine level", "", thi->zlevels, &thi->zlevels, NULL));
1428       } else {
1429         PetscCall(PetscOptionsInt("-P", "Number of elements in z-direction on coarse level", "", P, &P, NULL));
1430       }
1431     }
1432     PetscOptionsEnd();
1433     if (thi->coarse2d) {
1434       PetscCall(DMDACreate2d(comm, DM_BOUNDARY_PERIODIC, DM_BOUNDARY_PERIODIC, DMDA_STENCIL_BOX, N, M, PETSC_DETERMINE, PETSC_DETERMINE, sizeof(Node) / sizeof(PetscScalar), 1, 0, 0, &da));
1435       PetscCall(DMSetFromOptions(da));
1436       PetscCall(DMSetUp(da));
1437       da->ops->refinehierarchy     = DMRefineHierarchy_THI;
1438       da->ops->createinterpolation = DMCreateInterpolation_DA_THI;
1439 
1440       PetscCall(PetscObjectCompose((PetscObject)da, "THI", (PetscObject)thi));
1441     } else {
1442       PetscCall(DMDACreate3d(comm, DM_BOUNDARY_NONE, DM_BOUNDARY_PERIODIC, DM_BOUNDARY_PERIODIC, DMDA_STENCIL_BOX, P, N, M, 1, PETSC_DETERMINE, PETSC_DETERMINE, sizeof(Node) / sizeof(PetscScalar), 1, 0, 0, 0, &da));
1443       PetscCall(DMSetFromOptions(da));
1444       PetscCall(DMSetUp(da));
1445     }
1446     PetscCall(DMDASetFieldName(da, 0, "x-velocity"));
1447     PetscCall(DMDASetFieldName(da, 1, "y-velocity"));
1448   }
1449   PetscCall(THISetUpDM(thi, da));
1450   if (thi->tridiagonal) da->ops->creatematrix = DMCreateMatrix_THI_Tridiagonal;
1451 
1452   { /* Set the fine level matrix type if -da_refine */
1453     PetscInt rlevel, clevel;
1454     PetscCall(DMGetRefineLevel(da, &rlevel));
1455     PetscCall(DMGetCoarsenLevel(da, &clevel));
1456     if (rlevel - clevel > 0) PetscCall(DMSetMatType(da, thi->mattype));
1457   }
1458 
1459   PetscCall(DMDASNESSetFunctionLocal(da, ADD_VALUES, (DMDASNESFunction)THIFunctionLocal, thi));
1460   if (thi->tridiagonal) {
1461     PetscCall(DMDASNESSetJacobianLocal(da, (DMDASNESJacobian)THIJacobianLocal_3D_Tridiagonal, thi));
1462   } else {
1463     PetscCall(DMDASNESSetJacobianLocal(da, (DMDASNESJacobian)THIJacobianLocal_3D_Full, thi));
1464   }
1465   PetscCall(DMCoarsenHookAdd(da, DMCoarsenHook_THI, NULL, thi));
1466   PetscCall(DMRefineHookAdd(da, DMRefineHook_THI, NULL, thi));
1467 
1468   PetscCall(DMSetApplicationContext(da, thi));
1469 
1470   PetscCall(SNESCreate(comm, &snes));
1471   PetscCall(SNESSetDM(snes, da));
1472   PetscCall(DMDestroy(&da));
1473   PetscCall(SNESSetComputeInitialGuess(snes, THIInitial, NULL));
1474   PetscCall(SNESSetFromOptions(snes));
1475 
1476   PetscCall(SNESSolve(snes, NULL, NULL));
1477 
1478   PetscCall(THISolveStatistics(thi, snes, 0, "Full"));
1479 
1480   {
1481     PetscBool flg;
1482     char      filename[PETSC_MAX_PATH_LEN] = "";
1483     PetscCall(PetscOptionsGetString(NULL, NULL, "-o", filename, sizeof(filename), &flg));
1484     if (flg) {
1485       Vec X;
1486       DM  dm;
1487       PetscCall(SNESGetSolution(snes, &X));
1488       PetscCall(SNESGetDM(snes, &dm));
1489       PetscCall(THIDAVecView_VTK_XML(thi, dm, X, filename));
1490     }
1491   }
1492 
1493   PetscCall(DMDestroy(&da));
1494   PetscCall(SNESDestroy(&snes));
1495   PetscCall(THIDestroy(&thi));
1496   PetscCall(PetscFinalize());
1497   return 0;
1498 }
1499 
1500 /*TEST
1501 
1502    build:
1503       requires: !single
1504 
1505    test:
1506       args: -M 6 -P 4 -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type icc
1507 
1508    test:
1509       suffix: 2
1510       nsize: 2
1511       args: -M 6 -P 4 -thi_hom z -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 6 -mg_levels_0_pc_type redundant -snes_grid_sequence 1 -mat_partitioning_type current -ksp_atol -1
1512 
1513    test:
1514       suffix: 3
1515       nsize: 3
1516       args: -M 7 -P 4 -thi_hom z -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type baij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_pc_asm_type restrict -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 9 -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mat_partitioning_type current
1517 
1518    test:
1519       suffix: 4
1520       nsize: 6
1521       args: -M 4 -P 2 -da_refine_hierarchy_x 1,1,3 -da_refine_hierarchy_y 2,2,1 -da_refine_hierarchy_z 2,2,1 -snes_grid_sequence 3 -ksp_converged_reason -ksp_type fgmres -ksp_rtol 1e-2 -pc_type mg -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type bjacobi -mg_levels_1_sub_pc_type cholesky -pc_mg_type multiplicative -snes_converged_reason -snes_stol 1e-12 -thi_L 80e3 -thi_alpha 0.05 -thi_friction_m 1 -thi_hom x -snes_view -mg_levels_0_pc_type redundant -mg_levels_0_ksp_type preonly -ksp_atol -1
1522 
1523    test:
1524       suffix: 5
1525       nsize: 6
1526       args: -M 12 -P 5 -snes_monitor_short -ksp_converged_reason -pc_type asm -pc_asm_type restrict -dm_mat_type {{aij baij sbaij}}
1527 
1528 TEST*/
1529