xref: /petsc/src/snes/tutorials/ex48.c (revision 9d47de495d3c23378050c1b4a410c12a375cb6c6)
1 static const char help[] = "Toy hydrostatic ice flow with multigrid in 3D.\n\
2 \n\
3 Solves the hydrostatic (aka Blatter/Pattyn/First Order) equations for ice sheet flow\n\
4 using multigrid.  The ice uses a power-law rheology with \"Glen\" exponent 3 (corresponds\n\
5 to p=4/3 in a p-Laplacian).  The focus is on ISMIP-HOM experiments which assume periodic\n\
6 boundary conditions in the x- and y-directions.\n\
7 \n\
8 Equations are rescaled so that the domain size and solution are O(1), details of this scaling\n\
9 can be controlled by the options -units_meter, -units_second, and -units_kilogram.\n\
10 \n\
11 A VTK StructuredGrid output file can be written using the option -o filename.vts\n\
12 \n\n";
13 
14 /*
15 The equations for horizontal velocity (u,v) are
16 
17   - [eta (4 u_x + 2 v_y)]_x - [eta (u_y + v_x)]_y - [eta u_z]_z + rho g s_x = 0
18   - [eta (4 v_y + 2 u_x)]_y - [eta (u_y + v_x)]_x - [eta v_z]_z + rho g s_y = 0
19 
20 where
21 
22   eta = B/2 (epsilon + gamma)^((p-2)/2)
23 
24 is the nonlinear effective viscosity with regularization epsilon and hardness parameter B,
25 written in terms of the second invariant
26 
27   gamma = u_x^2 + v_y^2 + u_x v_y + (1/4) (u_y + v_x)^2 + (1/4) u_z^2 + (1/4) v_z^2
28 
29 The surface boundary conditions are the natural conditions.  The basal boundary conditions
30 are either no-slip, or Navier (linear) slip with spatially variant friction coefficient beta^2.
31 
32 In the code, the equations for (u,v) are multiplied through by 1/(rho g) so that residuals are O(1).
33 
34 The discretization is Q1 finite elements, managed by a DMDA.  The grid is never distorted in the
35 map (x,y) plane, but the bed and surface may be bumpy.  This is handled as usual in FEM, through
36 the Jacobian of the coordinate transformation from a reference element to the physical element.
37 
38 Since ice-flow is tightly coupled in the z-direction (within columns), the DMDA is managed
39 specially so that columns are never distributed, and are always contiguous in memory.
40 This amounts to reversing the meaning of X,Y,Z compared to the DMDA's internal interpretation,
41 and then indexing as vec[i][j][k].  The exotic coarse spaces require 2D DMDAs which are made to
42 use compatible domain decomposition relative to the 3D DMDAs.
43 
44 There are two compile-time options:
45 
46   NO_SSE2:
47     If the host supports SSE2, we use integration code that has been vectorized with SSE2
48     intrinsics, unless this macro is defined.  The intrinsics speed up integration by about
49     30% on my architecture (P8700, gcc-4.5 snapshot).
50 
51   COMPUTE_LOWER_TRIANGULAR:
52     The element matrices we assemble are lower-triangular so it is not necessary to compute
53     all entries explicitly.  If this macro is defined, the lower-triangular entries are
54     computed explicitly.
55 
56 */
57 
58 #if defined(PETSC_APPLE_FRAMEWORK)
59   #import <PETSc/petscsnes.h>
60   #import <PETSc/petsc/private/dmdaimpl.h> /* There is not yet a public interface to manipulate dm->ops */
61 #else
62 
63   #include <petscsnes.h>
64   #include <petsc/private/dmdaimpl.h> /* There is not yet a public interface to manipulate dm->ops */
65 #endif
66 #include <ctype.h> /* toupper() */
67 
68 #if defined(__cplusplus) || defined(PETSC_HAVE_WINDOWS_COMPILERS) || defined(__PGI)
69   /*  c++ cannot handle  [_restrict_] notation like C does */
70   #undef PETSC_RESTRICT
71   #define PETSC_RESTRICT
72 #endif
73 
74 #if defined(__SSE2__)
75   #include <emmintrin.h>
76 #endif
77 
78 /* The SSE2 kernels are only for PetscScalar=double on architectures that support it */
79 #if !defined(NO_SSE2) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_REAL_SINGLE) && !defined(PETSC_USE_REAL___FLOAT128) && !defined(PETSC_USE_REAL___FP16) && defined(__SSE2__)
80   #define USE_SSE2_KERNELS 1
81 #else
82   #define USE_SSE2_KERNELS 0
83 #endif
84 
85 static PetscClassId THI_CLASSID;
86 
87 typedef enum {
88   QUAD_GAUSS,
89   QUAD_LOBATTO
90 } QuadratureType;
91 static const char                  *QuadratureTypes[] = {"gauss", "lobatto", "QuadratureType", "QUAD_", 0};
92 PETSC_UNUSED static const PetscReal HexQWeights[8]    = {1, 1, 1, 1, 1, 1, 1, 1};
93 PETSC_UNUSED static const PetscReal HexQNodes[]       = {-0.57735026918962573, 0.57735026918962573};
94 #define G 0.57735026918962573
95 #define H (0.5 * (1. + G))
96 #define L (0.5 * (1. - G))
97 #define M (-0.5)
98 #define P (0.5)
99 /* Special quadrature: Lobatto in horizontal, Gauss in vertical */
100 static const PetscReal HexQInterp_Lobatto[8][8] = {
101   {H, 0, 0, 0, L, 0, 0, 0},
102   {0, H, 0, 0, 0, L, 0, 0},
103   {0, 0, H, 0, 0, 0, L, 0},
104   {0, 0, 0, H, 0, 0, 0, L},
105   {L, 0, 0, 0, H, 0, 0, 0},
106   {0, L, 0, 0, 0, H, 0, 0},
107   {0, 0, L, 0, 0, 0, H, 0},
108   {0, 0, 0, L, 0, 0, 0, H}
109 };
110 static const PetscReal HexQDeriv_Lobatto[8][8][3] = {
111   {{M * H, M *H, M}, {P * H, 0, 0},    {0, 0, 0},        {0, P *H, 0},     {M * L, M *L, P}, {P * L, 0, 0},    {0, 0, 0},        {0, P *L, 0}    },
112   {{M * H, 0, 0},    {P * H, M *H, M}, {0, P *H, 0},     {0, 0, 0},        {M * L, 0, 0},    {P * L, M *L, P}, {0, P *L, 0},     {0, 0, 0}       },
113   {{0, 0, 0},        {0, M *H, 0},     {P * H, P *H, M}, {M * H, 0, 0},    {0, 0, 0},        {0, M *L, 0},     {P * L, P *L, P}, {M * L, 0, 0}   },
114   {{0, M *H, 0},     {0, 0, 0},        {P * H, 0, 0},    {M * H, P *H, M}, {0, M *L, 0},     {0, 0, 0},        {P * L, 0, 0},    {M * L, P *L, P}},
115   {{M * L, M *L, M}, {P * L, 0, 0},    {0, 0, 0},        {0, P *L, 0},     {M * H, M *H, P}, {P * H, 0, 0},    {0, 0, 0},        {0, P *H, 0}    },
116   {{M * L, 0, 0},    {P * L, M *L, M}, {0, P *L, 0},     {0, 0, 0},        {M * H, 0, 0},    {P * H, M *H, P}, {0, P *H, 0},     {0, 0, 0}       },
117   {{0, 0, 0},        {0, M *L, 0},     {P * L, P *L, M}, {M * L, 0, 0},    {0, 0, 0},        {0, M *H, 0},     {P * H, P *H, P}, {M * H, 0, 0}   },
118   {{0, M *L, 0},     {0, 0, 0},        {P * L, 0, 0},    {M * L, P *L, M}, {0, M *H, 0},     {0, 0, 0},        {P * H, 0, 0},    {M * H, P *H, P}}
119 };
120 /* Stanndard Gauss */
121 static const PetscReal HexQInterp_Gauss[8][8] = {
122   {H * H * H, L * H * H, L * L * H, H * L * H, H * H * L, L * H * L, L * L * L, H * L * L},
123   {L * H * H, H * H * H, H * L * H, L * L * H, L * H * L, H * H * L, H * L * L, L * L * L},
124   {L * L * H, H * L * H, H * H * H, L * H * H, L * L * L, H * L * L, H * H * L, L * H * L},
125   {H * L * H, L * L * H, L * H * H, H * H * H, H * L * L, L * L * L, L * H * L, H * H * L},
126   {H * H * L, L * H * L, L * L * L, H * L * L, H * H * H, L * H * H, L * L * H, H * L * H},
127   {L * H * L, H * H * L, H * L * L, L * L * L, L * H * H, H * H * H, H * L * H, L * L * H},
128   {L * L * L, H * L * L, H * H * L, L * H * L, L * L * H, H * L * H, H * H * H, L * H * H},
129   {H * L * L, L * L * L, L * H * L, H * H * L, H * L * H, L * L * H, L * H * H, H * H * H}
130 };
131 static const PetscReal HexQDeriv_Gauss[8][8][3] = {
132   {{M * H * H, H * M * H, H * H * M}, {P * H * H, L * M * H, L * H * M}, {P * L * H, L * P * H, L * L * M}, {M * L * H, H * P * H, H * L * M}, {M * H * L, H * M * L, H * H * P}, {P * H * L, L * M * L, L * H * P}, {P * L * L, L * P * L, L * L * P}, {M * L * L, H * P * L, H * L * P}},
133   {{M * H * H, L * M * H, L * H * M}, {P * H * H, H * M * H, H * H * M}, {P * L * H, H * P * H, H * L * M}, {M * L * H, L * P * H, L * L * M}, {M * H * L, L * M * L, L * H * P}, {P * H * L, H * M * L, H * H * P}, {P * L * L, H * P * L, H * L * P}, {M * L * L, L * P * L, L * L * P}},
134   {{M * L * H, L * M * H, L * L * M}, {P * L * H, H * M * H, H * L * M}, {P * H * H, H * P * H, H * H * M}, {M * H * H, L * P * H, L * H * M}, {M * L * L, L * M * L, L * L * P}, {P * L * L, H * M * L, H * L * P}, {P * H * L, H * P * L, H * H * P}, {M * H * L, L * P * L, L * H * P}},
135   {{M * L * H, H * M * H, H * L * M}, {P * L * H, L * M * H, L * L * M}, {P * H * H, L * P * H, L * H * M}, {M * H * H, H * P * H, H * H * M}, {M * L * L, H * M * L, H * L * P}, {P * L * L, L * M * L, L * L * P}, {P * H * L, L * P * L, L * H * P}, {M * H * L, H * P * L, H * H * P}},
136   {{M * H * L, H * M * L, H * H * M}, {P * H * L, L * M * L, L * H * M}, {P * L * L, L * P * L, L * L * M}, {M * L * L, H * P * L, H * L * M}, {M * H * H, H * M * H, H * H * P}, {P * H * H, L * M * H, L * H * P}, {P * L * H, L * P * H, L * L * P}, {M * L * H, H * P * H, H * L * P}},
137   {{M * H * L, L * M * L, L * H * M}, {P * H * L, H * M * L, H * H * M}, {P * L * L, H * P * L, H * L * M}, {M * L * L, L * P * L, L * L * M}, {M * H * H, L * M * H, L * H * P}, {P * H * H, H * M * H, H * H * P}, {P * L * H, H * P * H, H * L * P}, {M * L * H, L * P * H, L * L * P}},
138   {{M * L * L, L * M * L, L * L * M}, {P * L * L, H * M * L, H * L * M}, {P * H * L, H * P * L, H * H * M}, {M * H * L, L * P * L, L * H * M}, {M * L * H, L * M * H, L * L * P}, {P * L * H, H * M * H, H * L * P}, {P * H * H, H * P * H, H * H * P}, {M * H * H, L * P * H, L * H * P}},
139   {{M * L * L, H * M * L, H * L * M}, {P * L * L, L * M * L, L * L * M}, {P * H * L, L * P * L, L * H * M}, {M * H * L, H * P * L, H * H * M}, {M * L * H, H * M * H, H * L * P}, {P * L * H, L * M * H, L * L * P}, {P * H * H, L * P * H, L * H * P}, {M * H * H, H * P * H, H * H * P}}
140 };
141 static const PetscReal (*HexQInterp)[8], (*HexQDeriv)[8][3];
142 /* Standard 2x2 Gauss quadrature for the bottom layer. */
143 static const PetscReal QuadQInterp[4][4] = {
144   {H * H, L *H, L *L, H *L},
145   {L * H, H *H, H *L, L *L},
146   {L * L, H *L, H *H, L *H},
147   {H * L, L *L, L *H, H *H}
148 };
149 static const PetscReal QuadQDeriv[4][4][2] = {
150   {{M * H, M *H}, {P * H, M *L}, {P * L, P *L}, {M * L, P *H}},
151   {{M * H, M *L}, {P * H, M *H}, {P * L, P *H}, {M * L, P *L}},
152   {{M * L, M *L}, {P * L, M *H}, {P * H, P *H}, {M * H, P *L}},
153   {{M * L, M *H}, {P * L, M *L}, {P * H, P *L}, {M * H, P *H}}
154 };
155 #undef G
156 #undef H
157 #undef L
158 #undef M
159 #undef P
160 
161 #define HexExtract(x, i, j, k, n) \
162   do { \
163     (n)[0] = (x)[i][j][k]; \
164     (n)[1] = (x)[i + 1][j][k]; \
165     (n)[2] = (x)[i + 1][j + 1][k]; \
166     (n)[3] = (x)[i][j + 1][k]; \
167     (n)[4] = (x)[i][j][k + 1]; \
168     (n)[5] = (x)[i + 1][j][k + 1]; \
169     (n)[6] = (x)[i + 1][j + 1][k + 1]; \
170     (n)[7] = (x)[i][j + 1][k + 1]; \
171   } while (0)
172 
173 #define HexExtractRef(x, i, j, k, n) \
174   do { \
175     (n)[0] = &(x)[i][j][k]; \
176     (n)[1] = &(x)[i + 1][j][k]; \
177     (n)[2] = &(x)[i + 1][j + 1][k]; \
178     (n)[3] = &(x)[i][j + 1][k]; \
179     (n)[4] = &(x)[i][j][k + 1]; \
180     (n)[5] = &(x)[i + 1][j][k + 1]; \
181     (n)[6] = &(x)[i + 1][j + 1][k + 1]; \
182     (n)[7] = &(x)[i][j + 1][k + 1]; \
183   } while (0)
184 
185 #define QuadExtract(x, i, j, n) \
186   do { \
187     (n)[0] = (x)[i][j]; \
188     (n)[1] = (x)[i + 1][j]; \
189     (n)[2] = (x)[i + 1][j + 1]; \
190     (n)[3] = (x)[i][j + 1]; \
191   } while (0)
192 
HexGrad(const PetscReal dphi[][3],const PetscReal zn[],PetscReal dz[])193 static void HexGrad(const PetscReal dphi[][3], const PetscReal zn[], PetscReal dz[])
194 {
195   PetscInt i;
196   dz[0] = dz[1] = dz[2] = 0;
197   for (i = 0; i < 8; i++) {
198     dz[0] += dphi[i][0] * zn[i];
199     dz[1] += dphi[i][1] * zn[i];
200     dz[2] += dphi[i][2] * zn[i];
201   }
202 }
203 
HexComputeGeometry(PetscInt q,PetscReal hx,PetscReal hy,const PetscReal dz[PETSC_RESTRICT],PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscReal * PETSC_RESTRICT jw)204 static void HexComputeGeometry(PetscInt q, PetscReal hx, PetscReal hy, const PetscReal dz[PETSC_RESTRICT], PetscReal phi[PETSC_RESTRICT], PetscReal dphi[PETSC_RESTRICT][3], PetscReal *PETSC_RESTRICT jw)
205 {
206   const PetscReal jac[3][3] = {
207     {hx / 2, 0,      0    },
208     {0,      hy / 2, 0    },
209     {dz[0],  dz[1],  dz[2]}
210   };
211   const PetscReal ijac[3][3] = {
212     {1 / jac[0][0],                        0,                                    0            },
213     {0,                                    1 / jac[1][1],                        0            },
214     {-jac[2][0] / (jac[0][0] * jac[2][2]), -jac[2][1] / (jac[1][1] * jac[2][2]), 1 / jac[2][2]}
215   };
216   const PetscReal jdet = jac[0][0] * jac[1][1] * jac[2][2];
217   PetscInt        i;
218 
219   for (i = 0; i < 8; i++) {
220     const PetscReal *dphir = HexQDeriv[q][i];
221     phi[i]                 = HexQInterp[q][i];
222     dphi[i][0]             = dphir[0] * ijac[0][0] + dphir[1] * ijac[1][0] + dphir[2] * ijac[2][0];
223     dphi[i][1]             = dphir[0] * ijac[0][1] + dphir[1] * ijac[1][1] + dphir[2] * ijac[2][1];
224     dphi[i][2]             = dphir[0] * ijac[0][2] + dphir[1] * ijac[1][2] + dphir[2] * ijac[2][2];
225   }
226   *jw = 1.0 * jdet;
227 }
228 
229 typedef struct _p_THI   *THI;
230 typedef struct _n_Units *Units;
231 
232 typedef struct {
233   PetscScalar u, v;
234 } Node;
235 
236 typedef struct {
237   PetscScalar b;     /* bed */
238   PetscScalar h;     /* thickness */
239   PetscScalar beta2; /* friction */
240 } PrmNode;
241 
242 typedef struct {
243   PetscReal min, max, cmin, cmax;
244 } PRange;
245 
246 typedef enum {
247   THIASSEMBLY_TRIDIAGONAL,
248   THIASSEMBLY_FULL
249 } THIAssemblyMode;
250 
251 struct _p_THI {
252   PETSCHEADER(int);
253   void (*initialize)(THI, PetscReal x, PetscReal y, PrmNode *p);
254   PetscInt  zlevels;
255   PetscReal Lx, Ly, Lz; /* Model domain */
256   PetscReal alpha;      /* Bed angle */
257   Units     units;
258   PetscReal dirichlet_scale;
259   PetscReal ssa_friction_scale;
260   PRange    eta;
261   PRange    beta2;
262   struct {
263     PetscReal Bd2, eps, exponent;
264   } viscosity;
265   struct {
266     PetscReal irefgam, eps2, exponent, refvel, epsvel;
267   } friction;
268   PetscReal rhog;
269   PetscBool no_slip;
270   PetscBool tridiagonal;
271   PetscBool coarse2d;
272   PetscBool verbose;
273   MatType   mattype;
274 };
275 
276 struct _n_Units {
277   /* fundamental */
278   PetscReal meter;
279   PetscReal kilogram;
280   PetscReal second;
281   /* derived */
282   PetscReal Pascal;
283   PetscReal year;
284 };
285 
286 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo *, Node ***, Mat, Mat, THI);
287 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo *, Node ***, Mat, Mat, THI);
288 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo *, Node **, Mat, Mat, THI);
289 
PrmHexGetZ(const PrmNode pn[],PetscInt k,PetscInt zm,PetscReal zn[])290 static void PrmHexGetZ(const PrmNode pn[], PetscInt k, PetscInt zm, PetscReal zn[])
291 {
292   const PetscScalar zm1 = zm - 1, znl[8] = {pn[0].b + pn[0].h * (PetscScalar)k / zm1,       pn[1].b + pn[1].h * (PetscScalar)k / zm1,       pn[2].b + pn[2].h * (PetscScalar)k / zm1,       pn[3].b + pn[3].h * (PetscScalar)k / zm1,
293                                             pn[0].b + pn[0].h * (PetscScalar)(k + 1) / zm1, pn[1].b + pn[1].h * (PetscScalar)(k + 1) / zm1, pn[2].b + pn[2].h * (PetscScalar)(k + 1) / zm1, pn[3].b + pn[3].h * (PetscScalar)(k + 1) / zm1};
294   PetscInt          i;
295   for (i = 0; i < 8; i++) zn[i] = PetscRealPart(znl[i]);
296 }
297 
298 /* Tests A and C are from the ISMIP-HOM paper (Pattyn et al. 2008) */
THIInitialize_HOM_A(THI thi,PetscReal x,PetscReal y,PrmNode * p)299 static void THIInitialize_HOM_A(THI thi, PetscReal x, PetscReal y, PrmNode *p)
300 {
301   Units     units = thi->units;
302   PetscReal s     = -x * PetscSinReal(thi->alpha);
303 
304   p->b     = s - 1000 * units->meter + 500 * units->meter * PetscSinReal(x * 2 * PETSC_PI / thi->Lx) * PetscSinReal(y * 2 * PETSC_PI / thi->Ly);
305   p->h     = s - p->b;
306   p->beta2 = 1e30;
307 }
308 
THIInitialize_HOM_C(THI thi,PetscReal x,PetscReal y,PrmNode * p)309 static void THIInitialize_HOM_C(THI thi, PetscReal x, PetscReal y, PrmNode *p)
310 {
311   Units     units = thi->units;
312   PetscReal s     = -x * PetscSinReal(thi->alpha);
313 
314   p->b = s - 1000 * units->meter;
315   p->h = s - p->b;
316   /* tau_b = beta2 v   is a stress (Pa) */
317   p->beta2 = 1000 * (1 + PetscSinReal(x * 2 * PETSC_PI / thi->Lx) * PetscSinReal(y * 2 * PETSC_PI / thi->Ly)) * units->Pascal * units->year / units->meter;
318 }
319 
320 /* These are just toys */
321 
322 /* Same bed as test A, free slip everywhere except for a discontinuous jump to a circular sticky region in the middle. */
THIInitialize_HOM_X(THI thi,PetscReal xx,PetscReal yy,PrmNode * p)323 static void THIInitialize_HOM_X(THI thi, PetscReal xx, PetscReal yy, PrmNode *p)
324 {
325   Units     units = thi->units;
326   PetscReal x = xx * 2 * PETSC_PI / thi->Lx - PETSC_PI, y = yy * 2 * PETSC_PI / thi->Ly - PETSC_PI; /* [-pi,pi] */
327   PetscReal r = PetscSqrtReal(x * x + y * y), s = -x * PetscSinReal(thi->alpha);
328   p->b     = s - 1000 * units->meter + 500 * units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
329   p->h     = s - p->b;
330   p->beta2 = 1000 * (r < 1 ? 2 : 0) * units->Pascal * units->year / units->meter;
331 }
332 
333 /* Like Z, but with 200 meter cliffs */
THIInitialize_HOM_Y(THI thi,PetscReal xx,PetscReal yy,PrmNode * p)334 static void THIInitialize_HOM_Y(THI thi, PetscReal xx, PetscReal yy, PrmNode *p)
335 {
336   Units     units = thi->units;
337   PetscReal x = xx * 2 * PETSC_PI / thi->Lx - PETSC_PI, y = yy * 2 * PETSC_PI / thi->Ly - PETSC_PI; /* [-pi,pi] */
338   PetscReal r = PetscSqrtReal(x * x + y * y), s = -x * PetscSinReal(thi->alpha);
339 
340   p->b = s - 1000 * units->meter + 500 * units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
341   if (PetscRealPart(p->b) > -700 * units->meter) p->b += 200 * units->meter;
342   p->h     = s - p->b;
343   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16 * r)) / PetscSqrtReal(1e-2 + 16 * r) * PetscCosReal(x * 3 / 2) * PetscCosReal(y * 3 / 2)) * units->Pascal * units->year / units->meter;
344 }
345 
346 /* Same bed as A, smoothly varying slipperiness, similar to MATLAB's "sombrero" (uncorrelated with bathymetry) */
THIInitialize_HOM_Z(THI thi,PetscReal xx,PetscReal yy,PrmNode * p)347 static void THIInitialize_HOM_Z(THI thi, PetscReal xx, PetscReal yy, PrmNode *p)
348 {
349   Units     units = thi->units;
350   PetscReal x = xx * 2 * PETSC_PI / thi->Lx - PETSC_PI, y = yy * 2 * PETSC_PI / thi->Ly - PETSC_PI; /* [-pi,pi] */
351   PetscReal r = PetscSqrtReal(x * x + y * y), s = -x * PetscSinReal(thi->alpha);
352 
353   p->b     = s - 1000 * units->meter + 500 * units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
354   p->h     = s - p->b;
355   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16 * r)) / PetscSqrtReal(1e-2 + 16 * r) * PetscCosReal(x * 3 / 2) * PetscCosReal(y * 3 / 2)) * units->Pascal * units->year / units->meter;
356 }
357 
THIFriction(THI thi,PetscReal rbeta2,PetscReal gam,PetscReal * beta2,PetscReal * dbeta2)358 static void THIFriction(THI thi, PetscReal rbeta2, PetscReal gam, PetscReal *beta2, PetscReal *dbeta2)
359 {
360   if (thi->friction.irefgam == 0) {
361     Units units           = thi->units;
362     thi->friction.irefgam = 1. / (0.5 * PetscSqr(thi->friction.refvel * units->meter / units->year));
363     thi->friction.eps2    = 0.5 * PetscSqr(thi->friction.epsvel * units->meter / units->year) * thi->friction.irefgam;
364   }
365   if (thi->friction.exponent == 0) {
366     *beta2  = rbeta2;
367     *dbeta2 = 0;
368   } else {
369     *beta2  = rbeta2 * PetscPowReal(thi->friction.eps2 + gam * thi->friction.irefgam, thi->friction.exponent);
370     *dbeta2 = thi->friction.exponent * *beta2 / (thi->friction.eps2 + gam * thi->friction.irefgam) * thi->friction.irefgam;
371   }
372 }
373 
THIViscosity(THI thi,PetscReal gam,PetscReal * eta,PetscReal * deta)374 static void THIViscosity(THI thi, PetscReal gam, PetscReal *eta, PetscReal *deta)
375 {
376   PetscReal Bd2, eps, exponent;
377   if (thi->viscosity.Bd2 == 0) {
378     Units           units   = thi->units;
379     const PetscReal n       = 3.,                                                     /* Glen exponent */
380       p                     = 1. + 1. / n,                                            /* for Stokes */
381       A                     = 1.e-16 * PetscPowReal(units->Pascal, -n) / units->year, /* softness parameter (Pa^{-n}/s) */
382       B                     = PetscPowReal(A, -1. / n);                               /* hardness parameter */
383     thi->viscosity.Bd2      = B / 2;
384     thi->viscosity.exponent = (p - 2) / 2;
385     thi->viscosity.eps      = 0.5 * PetscSqr(1e-5 / units->year);
386   }
387   Bd2      = thi->viscosity.Bd2;
388   exponent = thi->viscosity.exponent;
389   eps      = thi->viscosity.eps;
390   *eta     = Bd2 * PetscPowReal(eps + gam, exponent);
391   *deta    = exponent * (*eta) / (eps + gam);
392 }
393 
RangeUpdate(PetscReal * min,PetscReal * max,PetscReal x)394 static void RangeUpdate(PetscReal *min, PetscReal *max, PetscReal x)
395 {
396   if (x < *min) *min = x;
397   if (x > *max) *max = x;
398 }
399 
PRangeClear(PRange * p)400 static void PRangeClear(PRange *p)
401 {
402   p->cmin = p->min = 1e100;
403   p->cmax = p->max = -1e100;
404 }
405 
PRangeMinMax(PRange * p,PetscReal min,PetscReal max)406 static PetscErrorCode PRangeMinMax(PRange *p, PetscReal min, PetscReal max)
407 {
408   PetscFunctionBeginUser;
409   p->cmin = min;
410   p->cmax = max;
411   if (min < p->min) p->min = min;
412   if (max > p->max) p->max = max;
413   PetscFunctionReturn(PETSC_SUCCESS);
414 }
415 
THIDestroy(THI * thi)416 static PetscErrorCode THIDestroy(THI *thi)
417 {
418   PetscFunctionBeginUser;
419   if (!*thi) PetscFunctionReturn(PETSC_SUCCESS);
420   if (--((PetscObject)*thi)->refct > 0) {
421     *thi = 0;
422     PetscFunctionReturn(PETSC_SUCCESS);
423   }
424   PetscCall(PetscFree((*thi)->units));
425   PetscCall(PetscFree((*thi)->mattype));
426   PetscCall(PetscHeaderDestroy(thi));
427   PetscFunctionReturn(PETSC_SUCCESS);
428 }
429 
THICreate(MPI_Comm comm,THI * inthi)430 static PetscErrorCode THICreate(MPI_Comm comm, THI *inthi)
431 {
432   static PetscBool registered = PETSC_FALSE;
433   THI              thi;
434   Units            units;
435 
436   PetscFunctionBeginUser;
437   *inthi = 0;
438   if (!registered) {
439     PetscCall(PetscClassIdRegister("Toy Hydrostatic Ice", &THI_CLASSID));
440     registered = PETSC_TRUE;
441   }
442   PetscCall(PetscHeaderCreate(thi, THI_CLASSID, "THI", "Toy Hydrostatic Ice", "", comm, THIDestroy, 0));
443 
444   PetscCall(PetscNew(&thi->units));
445   units           = thi->units;
446   units->meter    = 1e-2;
447   units->second   = 1e-7;
448   units->kilogram = 1e-12;
449 
450   PetscOptionsBegin(comm, NULL, "Scaled units options", "");
451   {
452     PetscCall(PetscOptionsReal("-units_meter", "1 meter in scaled length units", "", units->meter, &units->meter, NULL));
453     PetscCall(PetscOptionsReal("-units_second", "1 second in scaled time units", "", units->second, &units->second, NULL));
454     PetscCall(PetscOptionsReal("-units_kilogram", "1 kilogram in scaled mass units", "", units->kilogram, &units->kilogram, NULL));
455   }
456   PetscOptionsEnd();
457   units->Pascal = units->kilogram / (units->meter * PetscSqr(units->second));
458   units->year   = 31556926. * units->second; /* seconds per year */
459 
460   thi->Lx              = 10.e3;
461   thi->Ly              = 10.e3;
462   thi->Lz              = 1000;
463   thi->dirichlet_scale = 1;
464   thi->verbose         = PETSC_FALSE;
465 
466   PetscOptionsBegin(comm, NULL, "Toy Hydrostatic Ice options", "");
467   {
468     QuadratureType quad       = QUAD_GAUSS;
469     char           homexp[]   = "A";
470     char           mtype[256] = MATSBAIJ;
471     PetscReal      L, m = 1.0;
472     PetscBool      flg;
473     L = thi->Lx;
474     PetscCall(PetscOptionsReal("-thi_L", "Domain size (m)", "", L, &L, &flg));
475     if (flg) thi->Lx = thi->Ly = L;
476     PetscCall(PetscOptionsReal("-thi_Lx", "X Domain size (m)", "", thi->Lx, &thi->Lx, NULL));
477     PetscCall(PetscOptionsReal("-thi_Ly", "Y Domain size (m)", "", thi->Ly, &thi->Ly, NULL));
478     PetscCall(PetscOptionsReal("-thi_Lz", "Z Domain size (m)", "", thi->Lz, &thi->Lz, NULL));
479     PetscCall(PetscOptionsString("-thi_hom", "ISMIP-HOM experiment (A or C)", "", homexp, homexp, sizeof(homexp), NULL));
480     switch (homexp[0] = (char)toupper(homexp[0])) {
481     case 'A':
482       thi->initialize = THIInitialize_HOM_A;
483       thi->no_slip    = PETSC_TRUE;
484       thi->alpha      = 0.5;
485       break;
486     case 'C':
487       thi->initialize = THIInitialize_HOM_C;
488       thi->no_slip    = PETSC_FALSE;
489       thi->alpha      = 0.1;
490       break;
491     case 'X':
492       thi->initialize = THIInitialize_HOM_X;
493       thi->no_slip    = PETSC_FALSE;
494       thi->alpha      = 0.3;
495       break;
496     case 'Y':
497       thi->initialize = THIInitialize_HOM_Y;
498       thi->no_slip    = PETSC_FALSE;
499       thi->alpha      = 0.5;
500       break;
501     case 'Z':
502       thi->initialize = THIInitialize_HOM_Z;
503       thi->no_slip    = PETSC_FALSE;
504       thi->alpha      = 0.5;
505       break;
506     default:
507       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "HOM experiment '%c' not implemented", homexp[0]);
508     }
509     PetscCall(PetscOptionsEnum("-thi_quadrature", "Quadrature to use for 3D elements", "", QuadratureTypes, (PetscEnum)quad, (PetscEnum *)&quad, NULL));
510     switch (quad) {
511     case QUAD_GAUSS:
512       HexQInterp = HexQInterp_Gauss;
513       HexQDeriv  = HexQDeriv_Gauss;
514       break;
515     case QUAD_LOBATTO:
516       HexQInterp = HexQInterp_Lobatto;
517       HexQDeriv  = HexQDeriv_Lobatto;
518       break;
519     }
520     PetscCall(PetscOptionsReal("-thi_alpha", "Bed angle (degrees)", "", thi->alpha, &thi->alpha, NULL));
521 
522     thi->friction.refvel = 100.;
523     thi->friction.epsvel = 1.;
524 
525     PetscCall(PetscOptionsReal("-thi_friction_refvel", "Reference velocity for sliding", "", thi->friction.refvel, &thi->friction.refvel, NULL));
526     PetscCall(PetscOptionsReal("-thi_friction_epsvel", "Regularization velocity for sliding", "", thi->friction.epsvel, &thi->friction.epsvel, NULL));
527     PetscCall(PetscOptionsReal("-thi_friction_m", "Friction exponent, 0=Coulomb, 1=Navier", "", m, &m, NULL));
528 
529     thi->friction.exponent = (m - 1) / 2;
530 
531     PetscCall(PetscOptionsReal("-thi_dirichlet_scale", "Scale Dirichlet boundary conditions by this factor", "", thi->dirichlet_scale, &thi->dirichlet_scale, NULL));
532     PetscCall(PetscOptionsReal("-thi_ssa_friction_scale", "Scale slip boundary conditions by this factor in SSA (2D) assembly", "", thi->ssa_friction_scale, &thi->ssa_friction_scale, NULL));
533     PetscCall(PetscOptionsBool("-thi_coarse2d", "Use a 2D coarse space corresponding to SSA", "", thi->coarse2d, &thi->coarse2d, NULL));
534     PetscCall(PetscOptionsBool("-thi_tridiagonal", "Assemble a tridiagonal system (column coupling only) on the finest level", "", thi->tridiagonal, &thi->tridiagonal, NULL));
535     PetscCall(PetscOptionsFList("-thi_mat_type", "Matrix type", "MatSetType", MatList, mtype, (char *)mtype, sizeof(mtype), NULL));
536     PetscCall(PetscStrallocpy(mtype, (char **)&thi->mattype));
537     PetscCall(PetscOptionsBool("-thi_verbose", "Enable verbose output (like matrix sizes and statistics)", "", thi->verbose, &thi->verbose, NULL));
538   }
539   PetscOptionsEnd();
540 
541   /* dimensionalize */
542   thi->Lx *= units->meter;
543   thi->Ly *= units->meter;
544   thi->Lz *= units->meter;
545   thi->alpha *= PETSC_PI / 180;
546 
547   PRangeClear(&thi->eta);
548   PRangeClear(&thi->beta2);
549 
550   {
551     PetscReal u = 1000 * units->meter / (3e7 * units->second), gradu = u / (100 * units->meter), eta, deta, rho = 910 * units->kilogram / PetscPowReal(units->meter, 3), grav = 9.81 * units->meter / PetscSqr(units->second),
552               driving = rho * grav * PetscSinReal(thi->alpha) * 1000 * units->meter;
553     THIViscosity(thi, 0.5 * gradu * gradu, &eta, &deta);
554     thi->rhog = rho * grav;
555     if (thi->verbose) {
556       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Units: meter %8.2g  second %8.2g  kg %8.2g  Pa %8.2g\n", (double)units->meter, (double)units->second, (double)units->kilogram, (double)units->Pascal));
557       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Domain (%6.2g,%6.2g,%6.2g), pressure %8.2g, driving stress %8.2g\n", (double)thi->Lx, (double)thi->Ly, (double)thi->Lz, (double)(rho * grav * 1e3 * units->meter), (double)driving));
558       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Large velocity 1km/a %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n", (double)u, (double)gradu, (double)eta, (double)(2 * eta * gradu), (double)(2 * eta * gradu / driving)));
559       THIViscosity(thi, 0.5 * PetscSqr(1e-3 * gradu), &eta, &deta);
560       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Small velocity 1m/a  %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n", (double)(1e-3 * u), (double)(1e-3 * gradu), (double)eta, (double)(2 * eta * 1e-3 * gradu), (double)(2 * eta * 1e-3 * gradu / driving)));
561     }
562   }
563 
564   *inthi = thi;
565   PetscFunctionReturn(PETSC_SUCCESS);
566 }
567 
THIInitializePrm(THI thi,DM da2prm,Vec prm)568 static PetscErrorCode THIInitializePrm(THI thi, DM da2prm, Vec prm)
569 {
570   PrmNode **p;
571   PetscInt  i, j, xs, xm, ys, ym, mx, my;
572 
573   PetscFunctionBeginUser;
574   PetscCall(DMDAGetGhostCorners(da2prm, &ys, &xs, 0, &ym, &xm, 0));
575   PetscCall(DMDAGetInfo(da2prm, 0, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
576   PetscCall(DMDAVecGetArray(da2prm, prm, &p));
577   for (i = xs; i < xs + xm; i++) {
578     for (j = ys; j < ys + ym; j++) {
579       PetscReal xx = thi->Lx * i / mx, yy = thi->Ly * j / my;
580       thi->initialize(thi, xx, yy, &p[i][j]);
581     }
582   }
583   PetscCall(DMDAVecRestoreArray(da2prm, prm, &p));
584   PetscFunctionReturn(PETSC_SUCCESS);
585 }
586 
THISetUpDM(THI thi,DM dm)587 static PetscErrorCode THISetUpDM(THI thi, DM dm)
588 {
589   PetscInt        refinelevel, coarsenlevel, level, dim, Mx, My, Mz, mx, my, s;
590   DMDAStencilType st;
591   DM              da2prm;
592   Vec             X;
593 
594   PetscFunctionBeginUser;
595   PetscCall(DMDAGetInfo(dm, &dim, &Mz, &My, &Mx, 0, &my, &mx, 0, &s, 0, 0, 0, &st));
596   if (dim == 2) PetscCall(DMDAGetInfo(dm, &dim, &My, &Mx, 0, &my, &mx, 0, 0, &s, 0, 0, 0, &st));
597   PetscCall(DMGetRefineLevel(dm, &refinelevel));
598   PetscCall(DMGetCoarsenLevel(dm, &coarsenlevel));
599   level = refinelevel - coarsenlevel;
600   PetscCall(DMDACreate2d(PetscObjectComm((PetscObject)thi), DM_BOUNDARY_PERIODIC, DM_BOUNDARY_PERIODIC, st, My, Mx, my, mx, sizeof(PrmNode) / sizeof(PetscScalar), s, 0, 0, &da2prm));
601   PetscCall(DMSetUp(da2prm));
602   PetscCall(DMCreateLocalVector(da2prm, &X));
603   {
604     PetscReal Lx = thi->Lx / thi->units->meter, Ly = thi->Ly / thi->units->meter, Lz = thi->Lz / thi->units->meter;
605     if (dim == 2) {
606       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Level %" PetscInt_FMT " domain size (m) %8.2g x %8.2g, num elements %" PetscInt_FMT " x %" PetscInt_FMT " (%" PetscInt_FMT "), size (m) %g x %g\n", level, (double)Lx, (double)Ly, Mx, My, Mx * My, (double)(Lx / Mx), (double)(Ly / My)));
607     } else {
608       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi), "Level %" PetscInt_FMT " domain size (m) %8.2g x %8.2g x %8.2g, num elements %" PetscInt_FMT " x %" PetscInt_FMT " x %" PetscInt_FMT " (%" PetscInt_FMT "), size (m) %g x %g x %g\n", level, (double)Lx, (double)Ly, (double)Lz, Mx, My, Mz, Mx * My * Mz, (double)(Lx / Mx), (double)(Ly / My), (double)(1000. / (Mz - 1))));
609     }
610   }
611   PetscCall(THIInitializePrm(thi, da2prm, X));
612   if (thi->tridiagonal) { /* Reset coarse Jacobian evaluation */
613     PetscCall(DMDASNESSetJacobianLocal(dm, (DMDASNESJacobianFn *)THIJacobianLocal_3D_Full, thi));
614   }
615   if (thi->coarse2d) PetscCall(DMDASNESSetJacobianLocal(dm, (DMDASNESJacobianFn *)THIJacobianLocal_2D, thi));
616   PetscCall(PetscObjectCompose((PetscObject)dm, "DMDA2Prm", (PetscObject)da2prm));
617   PetscCall(PetscObjectCompose((PetscObject)dm, "DMDA2Prm_Vec", (PetscObject)X));
618   PetscCall(DMDestroy(&da2prm));
619   PetscCall(VecDestroy(&X));
620   PetscFunctionReturn(PETSC_SUCCESS);
621 }
622 
DMCoarsenHook_THI(DM dmf,DM dmc,PetscCtx ctx)623 static PetscErrorCode DMCoarsenHook_THI(DM dmf, DM dmc, PetscCtx ctx)
624 {
625   THI      thi = (THI)ctx;
626   PetscInt rlevel, clevel;
627 
628   PetscFunctionBeginUser;
629   PetscCall(THISetUpDM(thi, dmc));
630   PetscCall(DMGetRefineLevel(dmc, &rlevel));
631   PetscCall(DMGetCoarsenLevel(dmc, &clevel));
632   if (rlevel - clevel == 0) PetscCall(DMSetMatType(dmc, MATAIJ));
633   PetscCall(DMCoarsenHookAdd(dmc, DMCoarsenHook_THI, NULL, thi));
634   PetscFunctionReturn(PETSC_SUCCESS);
635 }
636 
DMRefineHook_THI(DM dmc,DM dmf,PetscCtx ctx)637 static PetscErrorCode DMRefineHook_THI(DM dmc, DM dmf, PetscCtx ctx)
638 {
639   THI thi = (THI)ctx;
640 
641   PetscFunctionBeginUser;
642   PetscCall(THISetUpDM(thi, dmf));
643   PetscCall(DMSetMatType(dmf, thi->mattype));
644   PetscCall(DMRefineHookAdd(dmf, DMRefineHook_THI, NULL, thi));
645   /* With grid sequencing, a formerly-refined DM will later be coarsened by PCSetUp_MG */
646   PetscCall(DMCoarsenHookAdd(dmf, DMCoarsenHook_THI, NULL, thi));
647   PetscFunctionReturn(PETSC_SUCCESS);
648 }
649 
THIDAGetPrm(DM da,PrmNode *** prm)650 static PetscErrorCode THIDAGetPrm(DM da, PrmNode ***prm)
651 {
652   DM  da2prm;
653   Vec X;
654 
655   PetscFunctionBeginUser;
656   PetscCall(PetscObjectQuery((PetscObject)da, "DMDA2Prm", (PetscObject *)&da2prm));
657   PetscCheck(da2prm, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "No DMDA2Prm composed with given DMDA");
658   PetscCall(PetscObjectQuery((PetscObject)da, "DMDA2Prm_Vec", (PetscObject *)&X));
659   PetscCheck(X, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "No DMDA2Prm_Vec composed with given DMDA");
660   PetscCall(DMDAVecGetArray(da2prm, X, prm));
661   PetscFunctionReturn(PETSC_SUCCESS);
662 }
663 
THIDARestorePrm(DM da,PrmNode *** prm)664 static PetscErrorCode THIDARestorePrm(DM da, PrmNode ***prm)
665 {
666   DM  da2prm;
667   Vec X;
668 
669   PetscFunctionBeginUser;
670   PetscCall(PetscObjectQuery((PetscObject)da, "DMDA2Prm", (PetscObject *)&da2prm));
671   PetscCheck(da2prm, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "No DMDA2Prm composed with given DMDA");
672   PetscCall(PetscObjectQuery((PetscObject)da, "DMDA2Prm_Vec", (PetscObject *)&X));
673   PetscCheck(X, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "No DMDA2Prm_Vec composed with given DMDA");
674   PetscCall(DMDAVecRestoreArray(da2prm, X, prm));
675   PetscFunctionReturn(PETSC_SUCCESS);
676 }
677 
THIInitial(SNES snes,Vec X,PetscCtx ctx)678 static PetscErrorCode THIInitial(SNES snes, Vec X, PetscCtx ctx)
679 {
680   THI       thi;
681   PetscInt  i, j, k, xs, xm, ys, ym, zs, zm, mx, my;
682   PetscReal hx, hy;
683   PrmNode **prm;
684   Node   ***x;
685   DM        da;
686 
687   PetscFunctionBeginUser;
688   PetscCall(SNESGetDM(snes, &da));
689   PetscCall(DMGetApplicationContext(da, &thi));
690   PetscCall(DMDAGetInfo(da, 0, 0, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0));
691   PetscCall(DMDAGetCorners(da, &zs, &ys, &xs, &zm, &ym, &xm));
692   PetscCall(DMDAVecGetArray(da, X, &x));
693   PetscCall(THIDAGetPrm(da, &prm));
694   hx = thi->Lx / mx;
695   hy = thi->Ly / my;
696   for (i = xs; i < xs + xm; i++) {
697     for (j = ys; j < ys + ym; j++) {
698       for (k = zs; k < zs + zm; k++) {
699         const PetscScalar zm1 = zm - 1, drivingx = thi->rhog * (prm[i + 1][j].b + prm[i + 1][j].h - prm[i - 1][j].b - prm[i - 1][j].h) / (2 * hx), drivingy = thi->rhog * (prm[i][j + 1].b + prm[i][j + 1].h - prm[i][j - 1].b - prm[i][j - 1].h) / (2 * hy);
700         x[i][j][k].u = 0. * drivingx * prm[i][j].h * (PetscScalar)k / zm1;
701         x[i][j][k].v = 0. * drivingy * prm[i][j].h * (PetscScalar)k / zm1;
702       }
703     }
704   }
705   PetscCall(DMDAVecRestoreArray(da, X, &x));
706   PetscCall(THIDARestorePrm(da, &prm));
707   PetscFunctionReturn(PETSC_SUCCESS);
708 }
709 
PointwiseNonlinearity(THI thi,const Node n[PETSC_RESTRICT],const PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscScalar * PETSC_RESTRICT u,PetscScalar * PETSC_RESTRICT v,PetscScalar du[PETSC_RESTRICT],PetscScalar dv[PETSC_RESTRICT],PetscReal * eta,PetscReal * deta)710 static void PointwiseNonlinearity(THI thi, const Node n[PETSC_RESTRICT], const PetscReal phi[PETSC_RESTRICT], PetscReal dphi[PETSC_RESTRICT][3], PetscScalar *PETSC_RESTRICT u, PetscScalar *PETSC_RESTRICT v, PetscScalar du[PETSC_RESTRICT], PetscScalar dv[PETSC_RESTRICT], PetscReal *eta, PetscReal *deta)
711 {
712   PetscInt    l, ll;
713   PetscScalar gam;
714 
715   du[0] = du[1] = du[2] = 0;
716   dv[0] = dv[1] = dv[2] = 0;
717   *u                    = 0;
718   *v                    = 0;
719   for (l = 0; l < 8; l++) {
720     *u += phi[l] * n[l].u;
721     *v += phi[l] * n[l].v;
722     for (ll = 0; ll < 3; ll++) {
723       du[ll] += dphi[l][ll] * n[l].u;
724       dv[ll] += dphi[l][ll] * n[l].v;
725     }
726   }
727   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0] * dv[1] + 0.25 * PetscSqr(du[1] + dv[0]) + 0.25 * PetscSqr(du[2]) + 0.25 * PetscSqr(dv[2]);
728   THIViscosity(thi, PetscRealPart(gam), eta, deta);
729 }
730 
PointwiseNonlinearity2D(THI thi,Node n[],PetscReal phi[],PetscReal dphi[4][2],PetscScalar * u,PetscScalar * v,PetscScalar du[],PetscScalar dv[],PetscReal * eta,PetscReal * deta)731 static void PointwiseNonlinearity2D(THI thi, Node n[], PetscReal phi[], PetscReal dphi[4][2], PetscScalar *u, PetscScalar *v, PetscScalar du[], PetscScalar dv[], PetscReal *eta, PetscReal *deta)
732 {
733   PetscInt    l, ll;
734   PetscScalar gam;
735 
736   du[0] = du[1] = 0;
737   dv[0] = dv[1] = 0;
738   *u            = 0;
739   *v            = 0;
740   for (l = 0; l < 4; l++) {
741     *u += phi[l] * n[l].u;
742     *v += phi[l] * n[l].v;
743     for (ll = 0; ll < 2; ll++) {
744       du[ll] += dphi[l][ll] * n[l].u;
745       dv[ll] += dphi[l][ll] * n[l].v;
746     }
747   }
748   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0] * dv[1] + 0.25 * PetscSqr(du[1] + dv[0]);
749   THIViscosity(thi, PetscRealPart(gam), eta, deta);
750 }
751 
THIFunctionLocal(DMDALocalInfo * info,Node *** x,Node *** f,THI thi)752 static PetscErrorCode THIFunctionLocal(DMDALocalInfo *info, Node ***x, Node ***f, THI thi)
753 {
754   PetscInt  xs, ys, xm, ym, zm, i, j, k, q, l;
755   PetscReal hx, hy, etamin, etamax, beta2min, beta2max;
756   PrmNode **prm;
757 
758   PetscFunctionBeginUser;
759   xs = info->zs;
760   ys = info->ys;
761   xm = info->zm;
762   ym = info->ym;
763   zm = info->xm;
764   hx = thi->Lx / info->mz;
765   hy = thi->Ly / info->my;
766 
767   etamin   = 1e100;
768   etamax   = 0;
769   beta2min = 1e100;
770   beta2max = 0;
771 
772   PetscCall(THIDAGetPrm(info->da, &prm));
773 
774   for (i = xs; i < xs + xm; i++) {
775     for (j = ys; j < ys + ym; j++) {
776       PrmNode pn[4];
777       QuadExtract(prm, i, j, pn);
778       for (k = 0; k < zm - 1; k++) {
779         PetscInt  ls = 0;
780         Node      n[8], *fn[8];
781         PetscReal zn[8], etabase = 0;
782         PrmHexGetZ(pn, k, zm, zn);
783         HexExtract(x, i, j, k, n);
784         HexExtractRef(f, i, j, k, fn);
785         if (thi->no_slip && k == 0) {
786           for (l = 0; l < 4; l++) n[l].u = n[l].v = 0;
787           /* The first 4 basis functions lie on the bottom layer, so their contribution is exactly 0, hence we can skip them */
788           ls = 4;
789         }
790         for (q = 0; q < 8; q++) {
791           PetscReal   dz[3], phi[8], dphi[8][3], jw, eta, deta;
792           PetscScalar du[3], dv[3], u, v;
793           HexGrad(HexQDeriv[q], zn, dz);
794           HexComputeGeometry(q, hx, hy, dz, phi, dphi, &jw);
795           PointwiseNonlinearity(thi, n, phi, dphi, &u, &v, du, dv, &eta, &deta);
796           jw /= thi->rhog; /* scales residuals to be O(1) */
797           if (q == 0) etabase = eta;
798           RangeUpdate(&etamin, &etamax, eta);
799           for (l = ls; l < 8; l++) { /* test functions */
800             const PetscReal ds[2] = {-PetscSinReal(thi->alpha), 0};
801             const PetscReal pp = phi[l], *dp = dphi[l];
802             fn[l]->u += dp[0] * jw * eta * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * eta * (du[1] + dv[0]) + dp[2] * jw * eta * du[2] + pp * jw * thi->rhog * ds[0];
803             fn[l]->v += dp[1] * jw * eta * (2. * du[0] + 4. * dv[1]) + dp[0] * jw * eta * (du[1] + dv[0]) + dp[2] * jw * eta * dv[2] + pp * jw * thi->rhog * ds[1];
804           }
805         }
806         if (k == 0) { /* we are on a bottom face */
807           if (thi->no_slip) {
808             /* Note: Non-Galerkin coarse grid operators are very sensitive to the scaling of Dirichlet boundary
809             * conditions.  After shenanigans above, etabase contains the effective viscosity at the closest quadrature
810             * point to the bed.  We want the diagonal entry in the Dirichlet condition to have similar magnitude to the
811             * diagonal entry corresponding to the adjacent node.  The fundamental scaling of the viscous part is in
812             * diagu, diagv below.  This scaling is easy to recognize by considering the finite difference operator after
813             * scaling by element size.  The no-slip Dirichlet condition is scaled by this factor, and also in the
814             * assembled matrix (see the similar block in THIJacobianLocal).
815             *
816             * Note that the residual at this Dirichlet node is linear in the state at this node, but also depends
817             * (nonlinearly in general) on the neighboring interior nodes through the local viscosity.  This will make
818             * a matrix-free Jacobian have extra entries in the corresponding row.  We assemble only the diagonal part,
819             * so the solution will exactly satisfy the boundary condition after the first linear iteration.
820             */
821             const PetscReal   hz    = PetscRealPart(pn[0].h) / (zm - 1.);
822             const PetscScalar diagu = 2 * etabase / thi->rhog * (hx * hy / hz + hx * hz / hy + 4 * hy * hz / hx), diagv = 2 * etabase / thi->rhog * (hx * hy / hz + 4 * hx * hz / hy + hy * hz / hx);
823             fn[0]->u = thi->dirichlet_scale * diagu * x[i][j][k].u;
824             fn[0]->v = thi->dirichlet_scale * diagv * x[i][j][k].v;
825           } else { /* Integrate over bottom face to apply boundary condition */
826             for (q = 0; q < 4; q++) {
827               const PetscReal jw = 0.25 * hx * hy / thi->rhog, *phi = QuadQInterp[q];
828               PetscScalar     u = 0, v = 0, rbeta2 = 0;
829               PetscReal       beta2, dbeta2;
830               for (l = 0; l < 4; l++) {
831                 u += phi[l] * n[l].u;
832                 v += phi[l] * n[l].v;
833                 rbeta2 += phi[l] * pn[l].beta2;
834               }
835               THIFriction(thi, PetscRealPart(rbeta2), PetscRealPart(u * u + v * v) / 2, &beta2, &dbeta2);
836               RangeUpdate(&beta2min, &beta2max, beta2);
837               for (l = 0; l < 4; l++) {
838                 const PetscReal pp = phi[l];
839                 fn[ls + l]->u += pp * jw * beta2 * u;
840                 fn[ls + l]->v += pp * jw * beta2 * v;
841               }
842             }
843           }
844         }
845       }
846     }
847   }
848 
849   PetscCall(THIDARestorePrm(info->da, &prm));
850 
851   PetscCall(PRangeMinMax(&thi->eta, etamin, etamax));
852   PetscCall(PRangeMinMax(&thi->beta2, beta2min, beta2max));
853   PetscFunctionReturn(PETSC_SUCCESS);
854 }
855 
THIMatrixStatistics(THI thi,Mat B,PetscViewer viewer)856 static PetscErrorCode THIMatrixStatistics(THI thi, Mat B, PetscViewer viewer)
857 {
858   PetscReal   nrm;
859   PetscInt    m;
860   PetscMPIInt rank;
861 
862   PetscFunctionBeginUser;
863   PetscCall(MatNorm(B, NORM_FROBENIUS, &nrm));
864   PetscCall(MatGetSize(B, &m, 0));
865   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B), &rank));
866   if (rank == 0) {
867     PetscScalar val0, val2;
868     PetscCall(MatGetValue(B, 0, 0, &val0));
869     PetscCall(MatGetValue(B, 2, 2, &val2));
870     PetscCall(PetscViewerASCIIPrintf(viewer, "Matrix dim %" PetscInt_FMT " norm %8.2e (0,0) %8.2e  (2,2) %8.2e %8.2e <= eta <= %8.2e %8.2e <= beta2 <= %8.2e\n", m, (double)nrm, (double)PetscRealPart(val0), (double)PetscRealPart(val2),
871                                      (double)thi->eta.cmin, (double)thi->eta.cmax, (double)thi->beta2.cmin, (double)thi->beta2.cmax));
872   }
873   PetscFunctionReturn(PETSC_SUCCESS);
874 }
875 
THISurfaceStatistics(DM da,Vec X,PetscReal * min,PetscReal * max,PetscReal * mean)876 static PetscErrorCode THISurfaceStatistics(DM da, Vec X, PetscReal *min, PetscReal *max, PetscReal *mean)
877 {
878   Node     ***x;
879   PetscInt    i, j, xs, ys, zs, xm, ym, zm, mx, my, mz;
880   PetscReal   umin = 1e100, umax = -1e100;
881   PetscScalar usum = 0.0, gusum;
882 
883   PetscFunctionBeginUser;
884   *min = *max = *mean = 0;
885   PetscCall(DMDAGetInfo(da, 0, &mz, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0));
886   PetscCall(DMDAGetCorners(da, &zs, &ys, &xs, &zm, &ym, &xm));
887   PetscCheck(zs == 0 && zm == mz, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected decomposition");
888   PetscCall(DMDAVecGetArray(da, X, &x));
889   for (i = xs; i < xs + xm; i++) {
890     for (j = ys; j < ys + ym; j++) {
891       PetscReal u = PetscRealPart(x[i][j][zm - 1].u);
892       RangeUpdate(&umin, &umax, u);
893       usum += u;
894     }
895   }
896   PetscCall(DMDAVecRestoreArray(da, X, &x));
897   PetscCallMPI(MPIU_Allreduce(&umin, min, 1, MPIU_REAL, MPIU_MIN, PetscObjectComm((PetscObject)da)));
898   PetscCallMPI(MPIU_Allreduce(&umax, max, 1, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)da)));
899   PetscCallMPI(MPIU_Allreduce(&usum, &gusum, 1, MPIU_SCALAR, MPIU_SUM, PetscObjectComm((PetscObject)da)));
900   *mean = PetscRealPart(gusum) / (mx * my);
901   PetscFunctionReturn(PETSC_SUCCESS);
902 }
903 
THISolveStatistics(THI thi,SNES snes,PetscInt coarsened,const char name[])904 static PetscErrorCode THISolveStatistics(THI thi, SNES snes, PetscInt coarsened, const char name[])
905 {
906   MPI_Comm comm;
907   Vec      X;
908   DM       dm;
909 
910   PetscFunctionBeginUser;
911   PetscCall(PetscObjectGetComm((PetscObject)thi, &comm));
912   PetscCall(SNESGetSolution(snes, &X));
913   PetscCall(SNESGetDM(snes, &dm));
914   PetscCall(PetscPrintf(comm, "Solution statistics after solve: %s\n", name));
915   {
916     PetscInt            its, lits;
917     SNESConvergedReason reason;
918     PetscCall(SNESGetIterationNumber(snes, &its));
919     PetscCall(SNESGetConvergedReason(snes, &reason));
920     PetscCall(SNESGetLinearSolveIterations(snes, &lits));
921     PetscCall(PetscPrintf(comm, "%s: Number of SNES iterations = %" PetscInt_FMT ", total linear iterations = %" PetscInt_FMT "\n", SNESConvergedReasons[reason], its, lits));
922   }
923   {
924     PetscReal          nrm2, tmin[3] = {1e100, 1e100, 1e100}, tmax[3] = {-1e100, -1e100, -1e100}, min[3], max[3];
925     PetscInt           i, j, m;
926     const PetscScalar *x;
927     PetscCall(VecNorm(X, NORM_2, &nrm2));
928     PetscCall(VecGetLocalSize(X, &m));
929     PetscCall(VecGetArrayRead(X, &x));
930     for (i = 0; i < m; i += 2) {
931       PetscReal u = PetscRealPart(x[i]), v = PetscRealPart(x[i + 1]), c = PetscSqrtReal(u * u + v * v);
932       tmin[0] = PetscMin(u, tmin[0]);
933       tmin[1] = PetscMin(v, tmin[1]);
934       tmin[2] = PetscMin(c, tmin[2]);
935       tmax[0] = PetscMax(u, tmax[0]);
936       tmax[1] = PetscMax(v, tmax[1]);
937       tmax[2] = PetscMax(c, tmax[2]);
938     }
939     PetscCall(VecRestoreArrayRead(X, &x));
940     PetscCallMPI(MPIU_Allreduce(tmin, min, 3, MPIU_REAL, MPIU_MIN, PetscObjectComm((PetscObject)thi)));
941     PetscCallMPI(MPIU_Allreduce(tmax, max, 3, MPIU_REAL, MPIU_MAX, PetscObjectComm((PetscObject)thi)));
942     /* Dimensionalize to meters/year */
943     nrm2 *= thi->units->year / thi->units->meter;
944     for (j = 0; j < 3; j++) {
945       min[j] *= thi->units->year / thi->units->meter;
946       max[j] *= thi->units->year / thi->units->meter;
947     }
948     if (min[0] == 0.0) min[0] = 0.0;
949     PetscCall(PetscPrintf(comm, "|X|_2 %g   %g <= u <=  %g   %g <= v <=  %g   %g <= c <=  %g \n", (double)nrm2, (double)min[0], (double)max[0], (double)min[1], (double)max[1], (double)min[2], (double)max[2]));
950     {
951       PetscReal umin, umax, umean;
952       PetscCall(THISurfaceStatistics(dm, X, &umin, &umax, &umean));
953       umin *= thi->units->year / thi->units->meter;
954       umax *= thi->units->year / thi->units->meter;
955       umean *= thi->units->year / thi->units->meter;
956       PetscCall(PetscPrintf(comm, "Surface statistics: u in [%12.6e, %12.6e] mean %12.6e\n", (double)umin, (double)umax, (double)umean));
957     }
958     /* These values stay nondimensional */
959     PetscCall(PetscPrintf(comm, "Global eta range   %g to %g converged range %g to %g\n", (double)thi->eta.min, (double)thi->eta.max, (double)thi->eta.cmin, (double)thi->eta.cmax));
960     PetscCall(PetscPrintf(comm, "Global beta2 range %g to %g converged range %g to %g\n", (double)thi->beta2.min, (double)thi->beta2.max, (double)thi->beta2.cmin, (double)thi->beta2.cmax));
961   }
962   PetscFunctionReturn(PETSC_SUCCESS);
963 }
964 
THIJacobianLocal_2D(DMDALocalInfo * info,Node ** x,Mat J,Mat B,THI thi)965 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo *info, Node **x, Mat J, Mat B, THI thi)
966 {
967   PetscInt  xs, ys, xm, ym, i, j, q, l, ll;
968   PetscReal hx, hy;
969   PrmNode **prm;
970 
971   PetscFunctionBeginUser;
972   xs = info->ys;
973   ys = info->xs;
974   xm = info->ym;
975   ym = info->xm;
976   hx = thi->Lx / info->my;
977   hy = thi->Ly / info->mx;
978 
979   PetscCall(MatZeroEntries(B));
980   PetscCall(THIDAGetPrm(info->da, &prm));
981 
982   for (i = xs; i < xs + xm; i++) {
983     for (j = ys; j < ys + ym; j++) {
984       Node        n[4];
985       PrmNode     pn[4];
986       PetscScalar Ke[4 * 2][4 * 2];
987       QuadExtract(prm, i, j, pn);
988       QuadExtract(x, i, j, n);
989       PetscCall(PetscMemzero(Ke, sizeof(Ke)));
990       for (q = 0; q < 4; q++) {
991         PetscReal   phi[4], dphi[4][2], jw, eta, deta, beta2, dbeta2;
992         PetscScalar u, v, du[2], dv[2], h = 0, rbeta2 = 0;
993         for (l = 0; l < 4; l++) {
994           phi[l]     = QuadQInterp[q][l];
995           dphi[l][0] = QuadQDeriv[q][l][0] * 2. / hx;
996           dphi[l][1] = QuadQDeriv[q][l][1] * 2. / hy;
997           h += phi[l] * pn[l].h;
998           rbeta2 += phi[l] * pn[l].beta2;
999         }
1000         jw = 0.25 * hx * hy / thi->rhog; /* rhog is only scaling */
1001         PointwiseNonlinearity2D(thi, n, phi, dphi, &u, &v, du, dv, &eta, &deta);
1002         THIFriction(thi, PetscRealPart(rbeta2), PetscRealPart(u * u + v * v) / 2, &beta2, &dbeta2);
1003         for (l = 0; l < 4; l++) {
1004           const PetscReal pp = phi[l], *dp = dphi[l];
1005           for (ll = 0; ll < 4; ll++) {
1006             const PetscReal ppl = phi[ll], *dpl = dphi[ll];
1007             PetscScalar     dgdu, dgdv;
1008             dgdu = 2. * du[0] * dpl[0] + dv[1] * dpl[0] + 0.5 * (du[1] + dv[0]) * dpl[1];
1009             dgdv = 2. * dv[1] * dpl[1] + du[0] * dpl[1] + 0.5 * (du[1] + dv[0]) * dpl[0];
1010             /* Picard part */
1011             Ke[l * 2 + 0][ll * 2 + 0] += dp[0] * jw * eta * 4. * dpl[0] + dp[1] * jw * eta * dpl[1] + pp * jw * (beta2 / h) * ppl * thi->ssa_friction_scale;
1012             Ke[l * 2 + 0][ll * 2 + 1] += dp[0] * jw * eta * 2. * dpl[1] + dp[1] * jw * eta * dpl[0];
1013             Ke[l * 2 + 1][ll * 2 + 0] += dp[1] * jw * eta * 2. * dpl[0] + dp[0] * jw * eta * dpl[1];
1014             Ke[l * 2 + 1][ll * 2 + 1] += dp[1] * jw * eta * 4. * dpl[1] + dp[0] * jw * eta * dpl[0] + pp * jw * (beta2 / h) * ppl * thi->ssa_friction_scale;
1015             /* extra Newton terms */
1016             Ke[l * 2 + 0][ll * 2 + 0] += dp[0] * jw * deta * dgdu * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * deta * dgdu * (du[1] + dv[0]) + pp * jw * (dbeta2 / h) * u * u * ppl * thi->ssa_friction_scale;
1017             Ke[l * 2 + 0][ll * 2 + 1] += dp[0] * jw * deta * dgdv * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * deta * dgdv * (du[1] + dv[0]) + pp * jw * (dbeta2 / h) * u * v * ppl * thi->ssa_friction_scale;
1018             Ke[l * 2 + 1][ll * 2 + 0] += dp[1] * jw * deta * dgdu * (4. * dv[1] + 2. * du[0]) + dp[0] * jw * deta * dgdu * (du[1] + dv[0]) + pp * jw * (dbeta2 / h) * v * u * ppl * thi->ssa_friction_scale;
1019             Ke[l * 2 + 1][ll * 2 + 1] += dp[1] * jw * deta * dgdv * (4. * dv[1] + 2. * du[0]) + dp[0] * jw * deta * dgdv * (du[1] + dv[0]) + pp * jw * (dbeta2 / h) * v * v * ppl * thi->ssa_friction_scale;
1020           }
1021         }
1022       }
1023       {
1024         const MatStencil rc[4] = {
1025           {0, i,     j,     0},
1026           {0, i + 1, j,     0},
1027           {0, i + 1, j + 1, 0},
1028           {0, i,     j + 1, 0}
1029         };
1030         PetscCall(MatSetValuesBlockedStencil(B, 4, rc, 4, rc, &Ke[0][0], ADD_VALUES));
1031       }
1032     }
1033   }
1034   PetscCall(THIDARestorePrm(info->da, &prm));
1035 
1036   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1037   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1038   PetscCall(MatSetOption(B, MAT_SYMMETRIC, PETSC_TRUE));
1039   if (thi->verbose) PetscCall(THIMatrixStatistics(thi, B, PETSC_VIEWER_STDOUT_WORLD));
1040   PetscFunctionReturn(PETSC_SUCCESS);
1041 }
1042 
THIJacobianLocal_3D(DMDALocalInfo * info,Node *** x,Mat B,THI thi,THIAssemblyMode amode)1043 static PetscErrorCode THIJacobianLocal_3D(DMDALocalInfo *info, Node ***x, Mat B, THI thi, THIAssemblyMode amode)
1044 {
1045   PetscInt  xs, ys, xm, ym, zm, i, j, k, q, l, ll;
1046   PetscReal hx, hy;
1047   PrmNode **prm;
1048 
1049   PetscFunctionBeginUser;
1050   xs = info->zs;
1051   ys = info->ys;
1052   xm = info->zm;
1053   ym = info->ym;
1054   zm = info->xm;
1055   hx = thi->Lx / info->mz;
1056   hy = thi->Ly / info->my;
1057 
1058   PetscCall(MatZeroEntries(B));
1059   PetscCall(MatSetOption(B, MAT_SUBSET_OFF_PROC_ENTRIES, PETSC_TRUE));
1060   PetscCall(THIDAGetPrm(info->da, &prm));
1061 
1062   for (i = xs; i < xs + xm; i++) {
1063     for (j = ys; j < ys + ym; j++) {
1064       PrmNode pn[4];
1065       QuadExtract(prm, i, j, pn);
1066       for (k = 0; k < zm - 1; k++) {
1067         Node        n[8];
1068         PetscReal   zn[8], etabase = 0;
1069         PetscScalar Ke[8 * 2][8 * 2];
1070         PetscInt    ls = 0;
1071 
1072         PrmHexGetZ(pn, k, zm, zn);
1073         HexExtract(x, i, j, k, n);
1074         PetscCall(PetscMemzero(Ke, sizeof(Ke)));
1075         if (thi->no_slip && k == 0) {
1076           for (l = 0; l < 4; l++) n[l].u = n[l].v = 0;
1077           ls = 4;
1078         }
1079         for (q = 0; q < 8; q++) {
1080           PetscReal   dz[3], phi[8], dphi[8][3], jw, eta, deta;
1081           PetscScalar du[3], dv[3], u, v;
1082           HexGrad(HexQDeriv[q], zn, dz);
1083           HexComputeGeometry(q, hx, hy, dz, phi, dphi, &jw);
1084           PointwiseNonlinearity(thi, n, phi, dphi, &u, &v, du, dv, &eta, &deta);
1085           jw /= thi->rhog; /* residuals are scaled by this factor */
1086           if (q == 0) etabase = eta;
1087           for (l = ls; l < 8; l++) { /* test functions */
1088             const PetscReal *PETSC_RESTRICT dp = dphi[l];
1089 #if USE_SSE2_KERNELS
1090             /* gcc (up to my 4.5 snapshot) is really bad at hoisting intrinsics so we do it manually */
1091             __m128d p4 = _mm_set1_pd(4), p2 = _mm_set1_pd(2), p05 = _mm_set1_pd(0.5), p42 = _mm_setr_pd(4, 2), p24 = _mm_shuffle_pd(p42, p42, _MM_SHUFFLE2(0, 1)), du0 = _mm_set1_pd(du[0]), du1 = _mm_set1_pd(du[1]), du2 = _mm_set1_pd(du[2]), dv0 = _mm_set1_pd(dv[0]), dv1 = _mm_set1_pd(dv[1]), dv2 = _mm_set1_pd(dv[2]), jweta = _mm_set1_pd(jw * eta), jwdeta = _mm_set1_pd(jw * deta), dp0 = _mm_set1_pd(dp[0]), dp1 = _mm_set1_pd(dp[1]), dp2 = _mm_set1_pd(dp[2]), dp0jweta = _mm_mul_pd(dp0, jweta), dp1jweta = _mm_mul_pd(dp1, jweta), dp2jweta = _mm_mul_pd(dp2, jweta), p4du0p2dv1 = _mm_add_pd(_mm_mul_pd(p4, du0), _mm_mul_pd(p2, dv1)), /* 4 du0 + 2 dv1 */
1092               p4dv1p2du0 = _mm_add_pd(_mm_mul_pd(p4, dv1), _mm_mul_pd(p2, du0)), /* 4 dv1 + 2 du0 */
1093               pdu2dv2    = _mm_unpacklo_pd(du2, dv2),                            /* [du2, dv2] */
1094               du1pdv0    = _mm_add_pd(du1, dv0),                                 /* du1 + dv0 */
1095               t1         = _mm_mul_pd(dp0, p4du0p2dv1),                          /* dp0 (4 du0 + 2 dv1) */
1096               t2         = _mm_mul_pd(dp1, p4dv1p2du0);                          /* dp1 (4 dv1 + 2 du0) */
1097 
1098 #endif
1099 #if defined(COMPUTE_LOWER_TRIANGULAR)     /* The element matrices are always symmetric so computing the lower-triangular part is not necessary */
1100             for (ll = ls; ll < 8; ll++) { /* trial functions */
1101 #else
1102             for (ll = l; ll < 8; ll++) {
1103 #endif
1104               const PetscReal *PETSC_RESTRICT dpl = dphi[ll];
1105               if (amode == THIASSEMBLY_TRIDIAGONAL && (l - ll) % 4) continue; /* these entries would not be inserted */
1106 #if !USE_SSE2_KERNELS
1107               /* The analytic Jacobian in nice, easy-to-read form */
1108               {
1109                 PetscScalar dgdu, dgdv;
1110                 dgdu = 2. * du[0] * dpl[0] + dv[1] * dpl[0] + 0.5 * (du[1] + dv[0]) * dpl[1] + 0.5 * du[2] * dpl[2];
1111                 dgdv = 2. * dv[1] * dpl[1] + du[0] * dpl[1] + 0.5 * (du[1] + dv[0]) * dpl[0] + 0.5 * dv[2] * dpl[2];
1112                 /* Picard part */
1113                 Ke[l * 2 + 0][ll * 2 + 0] += dp[0] * jw * eta * 4. * dpl[0] + dp[1] * jw * eta * dpl[1] + dp[2] * jw * eta * dpl[2];
1114                 Ke[l * 2 + 0][ll * 2 + 1] += dp[0] * jw * eta * 2. * dpl[1] + dp[1] * jw * eta * dpl[0];
1115                 Ke[l * 2 + 1][ll * 2 + 0] += dp[1] * jw * eta * 2. * dpl[0] + dp[0] * jw * eta * dpl[1];
1116                 Ke[l * 2 + 1][ll * 2 + 1] += dp[1] * jw * eta * 4. * dpl[1] + dp[0] * jw * eta * dpl[0] + dp[2] * jw * eta * dpl[2];
1117                 /* extra Newton terms */
1118                 Ke[l * 2 + 0][ll * 2 + 0] += dp[0] * jw * deta * dgdu * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * deta * dgdu * (du[1] + dv[0]) + dp[2] * jw * deta * dgdu * du[2];
1119                 Ke[l * 2 + 0][ll * 2 + 1] += dp[0] * jw * deta * dgdv * (4. * du[0] + 2. * dv[1]) + dp[1] * jw * deta * dgdv * (du[1] + dv[0]) + dp[2] * jw * deta * dgdv * du[2];
1120                 Ke[l * 2 + 1][ll * 2 + 0] += dp[1] * jw * deta * dgdu * (4. * dv[1] + 2. * du[0]) + dp[0] * jw * deta * dgdu * (du[1] + dv[0]) + dp[2] * jw * deta * dgdu * dv[2];
1121                 Ke[l * 2 + 1][ll * 2 + 1] += dp[1] * jw * deta * dgdv * (4. * dv[1] + 2. * du[0]) + dp[0] * jw * deta * dgdv * (du[1] + dv[0]) + dp[2] * jw * deta * dgdv * dv[2];
1122               }
1123 #else
1124               /* This SSE2 code is an exact replica of above, but uses explicit packed instructions for some speed
1125               * benefit.  On my hardware, these intrinsics are almost twice as fast as above, reducing total assembly cost
1126               * by 25 to 30 percent. */
1127               {
1128                 __m128d keu = _mm_loadu_pd(&Ke[l * 2 + 0][ll * 2 + 0]), kev = _mm_loadu_pd(&Ke[l * 2 + 1][ll * 2 + 0]), dpl01 = _mm_loadu_pd(&dpl[0]), dpl10 = _mm_shuffle_pd(dpl01, dpl01, _MM_SHUFFLE2(0, 1)), dpl2 = _mm_set_sd(dpl[2]), t0, t3, pdgduv;
1129                 keu    = _mm_add_pd(keu, _mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp0jweta, p42), dpl01), _mm_add_pd(_mm_mul_pd(dp1jweta, dpl10), _mm_mul_pd(dp2jweta, dpl2))));
1130                 kev    = _mm_add_pd(kev, _mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp1jweta, p24), dpl01), _mm_add_pd(_mm_mul_pd(dp0jweta, dpl10), _mm_mul_pd(dp2jweta, _mm_shuffle_pd(dpl2, dpl2, _MM_SHUFFLE2(0, 1))))));
1131                 pdgduv = _mm_mul_pd(p05, _mm_add_pd(_mm_add_pd(_mm_mul_pd(p42, _mm_mul_pd(du0, dpl01)), _mm_mul_pd(p24, _mm_mul_pd(dv1, dpl01))), _mm_add_pd(_mm_mul_pd(du1pdv0, dpl10), _mm_mul_pd(pdu2dv2, _mm_set1_pd(dpl[2]))))); /* [dgdu, dgdv] */
1132                 t0     = _mm_mul_pd(jwdeta, pdgduv); /* jw deta [dgdu, dgdv] */
1133                 t3     = _mm_mul_pd(t0, du1pdv0);    /* t0 (du1 + dv0) */
1134                 _mm_storeu_pd(&Ke[l * 2 + 0][ll * 2 + 0], _mm_add_pd(keu, _mm_add_pd(_mm_mul_pd(t1, t0), _mm_add_pd(_mm_mul_pd(dp1, t3), _mm_mul_pd(t0, _mm_mul_pd(dp2, du2))))));
1135                 _mm_storeu_pd(&Ke[l * 2 + 1][ll * 2 + 0], _mm_add_pd(kev, _mm_add_pd(_mm_mul_pd(t2, t0), _mm_add_pd(_mm_mul_pd(dp0, t3), _mm_mul_pd(t0, _mm_mul_pd(dp2, dv2))))));
1136               }
1137 #endif
1138             }
1139           }
1140         }
1141         if (k == 0) { /* on a bottom face */
1142           if (thi->no_slip) {
1143             const PetscReal   hz    = PetscRealPart(pn[0].h) / (zm - 1);
1144             const PetscScalar diagu = 2 * etabase / thi->rhog * (hx * hy / hz + hx * hz / hy + 4 * hy * hz / hx), diagv = 2 * etabase / thi->rhog * (hx * hy / hz + 4 * hx * hz / hy + hy * hz / hx);
1145             Ke[0][0] = thi->dirichlet_scale * diagu;
1146             Ke[1][1] = thi->dirichlet_scale * diagv;
1147           } else {
1148             for (q = 0; q < 4; q++) {
1149               const PetscReal jw = 0.25 * hx * hy / thi->rhog, *phi = QuadQInterp[q];
1150               PetscScalar     u = 0, v = 0, rbeta2 = 0;
1151               PetscReal       beta2, dbeta2;
1152               for (l = 0; l < 4; l++) {
1153                 u += phi[l] * n[l].u;
1154                 v += phi[l] * n[l].v;
1155                 rbeta2 += phi[l] * pn[l].beta2;
1156               }
1157               THIFriction(thi, PetscRealPart(rbeta2), PetscRealPart(u * u + v * v) / 2, &beta2, &dbeta2);
1158               for (l = 0; l < 4; l++) {
1159                 const PetscReal pp = phi[l];
1160                 for (ll = 0; ll < 4; ll++) {
1161                   const PetscReal ppl = phi[ll];
1162                   Ke[l * 2 + 0][ll * 2 + 0] += pp * jw * beta2 * ppl + pp * jw * dbeta2 * u * u * ppl;
1163                   Ke[l * 2 + 0][ll * 2 + 1] += pp * jw * dbeta2 * u * v * ppl;
1164                   Ke[l * 2 + 1][ll * 2 + 0] += pp * jw * dbeta2 * v * u * ppl;
1165                   Ke[l * 2 + 1][ll * 2 + 1] += pp * jw * beta2 * ppl + pp * jw * dbeta2 * v * v * ppl;
1166                 }
1167               }
1168             }
1169           }
1170         }
1171         {
1172           const MatStencil rc[8] = {
1173             {i,     j,     k,     0},
1174             {i + 1, j,     k,     0},
1175             {i + 1, j + 1, k,     0},
1176             {i,     j + 1, k,     0},
1177             {i,     j,     k + 1, 0},
1178             {i + 1, j,     k + 1, 0},
1179             {i + 1, j + 1, k + 1, 0},
1180             {i,     j + 1, k + 1, 0}
1181           };
1182           if (amode == THIASSEMBLY_TRIDIAGONAL) {
1183             for (l = 0; l < 4; l++) { /* Copy out each of the blocks, discarding horizontal coupling */
1184               const PetscInt   l4     = l + 4;
1185               const MatStencil rcl[2] = {
1186                 {rc[l].k,  rc[l].j,  rc[l].i,  0},
1187                 {rc[l4].k, rc[l4].j, rc[l4].i, 0}
1188               };
1189 #if defined(COMPUTE_LOWER_TRIANGULAR)
1190               const PetscScalar Kel[4][4] = {
1191                 {Ke[2 * l + 0][2 * l + 0],  Ke[2 * l + 0][2 * l + 1],  Ke[2 * l + 0][2 * l4 + 0],  Ke[2 * l + 0][2 * l4 + 1] },
1192                 {Ke[2 * l + 1][2 * l + 0],  Ke[2 * l + 1][2 * l + 1],  Ke[2 * l + 1][2 * l4 + 0],  Ke[2 * l + 1][2 * l4 + 1] },
1193                 {Ke[2 * l4 + 0][2 * l + 0], Ke[2 * l4 + 0][2 * l + 1], Ke[2 * l4 + 0][2 * l4 + 0], Ke[2 * l4 + 0][2 * l4 + 1]},
1194                 {Ke[2 * l4 + 1][2 * l + 0], Ke[2 * l4 + 1][2 * l + 1], Ke[2 * l4 + 1][2 * l4 + 0], Ke[2 * l4 + 1][2 * l4 + 1]}
1195               };
1196 #else
1197               /* Same as above except for the lower-left block */
1198               const PetscScalar Kel[4][4] = {
1199                 {Ke[2 * l + 0][2 * l + 0],  Ke[2 * l + 0][2 * l + 1],  Ke[2 * l + 0][2 * l4 + 0],  Ke[2 * l + 0][2 * l4 + 1] },
1200                 {Ke[2 * l + 1][2 * l + 0],  Ke[2 * l + 1][2 * l + 1],  Ke[2 * l + 1][2 * l4 + 0],  Ke[2 * l + 1][2 * l4 + 1] },
1201                 {Ke[2 * l + 0][2 * l4 + 0], Ke[2 * l + 1][2 * l4 + 0], Ke[2 * l4 + 0][2 * l4 + 0], Ke[2 * l4 + 0][2 * l4 + 1]},
1202                 {Ke[2 * l + 0][2 * l4 + 1], Ke[2 * l + 1][2 * l4 + 1], Ke[2 * l4 + 1][2 * l4 + 0], Ke[2 * l4 + 1][2 * l4 + 1]}
1203               };
1204 #endif
1205               PetscCall(MatSetValuesBlockedStencil(B, 2, rcl, 2, rcl, &Kel[0][0], ADD_VALUES));
1206             }
1207           } else {
1208 #if !defined(COMPUTE_LOWER_TRIANGULAR) /* fill in lower-triangular part, this is really cheap compared to computing the entries */
1209             for (l = 0; l < 8; l++) {
1210               for (ll = l + 1; ll < 8; ll++) {
1211                 Ke[ll * 2 + 0][l * 2 + 0] = Ke[l * 2 + 0][ll * 2 + 0];
1212                 Ke[ll * 2 + 1][l * 2 + 0] = Ke[l * 2 + 0][ll * 2 + 1];
1213                 Ke[ll * 2 + 0][l * 2 + 1] = Ke[l * 2 + 1][ll * 2 + 0];
1214                 Ke[ll * 2 + 1][l * 2 + 1] = Ke[l * 2 + 1][ll * 2 + 1];
1215               }
1216             }
1217 #endif
1218             PetscCall(MatSetValuesBlockedStencil(B, 8, rc, 8, rc, &Ke[0][0], ADD_VALUES));
1219           }
1220         }
1221       }
1222     }
1223   }
1224   PetscCall(THIDARestorePrm(info->da, &prm));
1225 
1226   PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1227   PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1228   PetscCall(MatSetOption(B, MAT_SYMMETRIC, PETSC_TRUE));
1229   if (thi->verbose) PetscCall(THIMatrixStatistics(thi, B, PETSC_VIEWER_STDOUT_WORLD));
1230   PetscFunctionReturn(PETSC_SUCCESS);
1231 }
1232 
1233 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo *info, Node ***x, Mat A, Mat B, THI thi)
1234 {
1235   PetscFunctionBeginUser;
1236   PetscCall(THIJacobianLocal_3D(info, x, B, thi, THIASSEMBLY_FULL));
1237   PetscFunctionReturn(PETSC_SUCCESS);
1238 }
1239 
1240 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo *info, Node ***x, Mat A, Mat B, THI thi)
1241 {
1242   PetscFunctionBeginUser;
1243   PetscCall(THIJacobianLocal_3D(info, x, B, thi, THIASSEMBLY_TRIDIAGONAL));
1244   PetscFunctionReturn(PETSC_SUCCESS);
1245 }
1246 
1247 static PetscErrorCode DMRefineHierarchy_THI(DM dac0, PetscInt nlevels, DM hierarchy[])
1248 {
1249   THI             thi;
1250   PetscInt        dim, M, N, m, n, s, dof;
1251   DM              dac, daf;
1252   DMDAStencilType st;
1253   DM_DA          *ddf, *ddc;
1254 
1255   PetscFunctionBeginUser;
1256   PetscCall(PetscObjectQuery((PetscObject)dac0, "THI", (PetscObject *)&thi));
1257   PetscCheck(thi, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot refine this DMDA, missing composed THI instance");
1258   if (nlevels > 1) {
1259     PetscCall(DMRefineHierarchy(dac0, nlevels - 1, hierarchy));
1260     dac = hierarchy[nlevels - 2];
1261   } else {
1262     dac = dac0;
1263   }
1264   PetscCall(DMDAGetInfo(dac, &dim, &N, &M, 0, &n, &m, 0, &dof, &s, 0, 0, 0, &st));
1265   PetscCheck(dim == 2, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "This function can only refine 2D DMDAs");
1266 
1267   /* Creates a 3D DMDA with the same map-plane layout as the 2D one, with contiguous columns */
1268   PetscCall(DMDACreate3d(PetscObjectComm((PetscObject)dac), DM_BOUNDARY_NONE, DM_BOUNDARY_PERIODIC, DM_BOUNDARY_PERIODIC, st, thi->zlevels, N, M, 1, n, m, dof, s, NULL, NULL, NULL, &daf));
1269   PetscCall(DMSetUp(daf));
1270 
1271   daf->ops->creatematrix        = dac->ops->creatematrix;
1272   daf->ops->createinterpolation = dac->ops->createinterpolation;
1273   daf->ops->getcoloring         = dac->ops->getcoloring;
1274   ddf                           = (DM_DA *)daf->data;
1275   ddc                           = (DM_DA *)dac->data;
1276   ddf->interptype               = ddc->interptype;
1277 
1278   PetscCall(DMDASetFieldName(daf, 0, "x-velocity"));
1279   PetscCall(DMDASetFieldName(daf, 1, "y-velocity"));
1280 
1281   hierarchy[nlevels - 1] = daf;
1282   PetscFunctionReturn(PETSC_SUCCESS);
1283 }
1284 
1285 static PetscErrorCode DMCreateInterpolation_DA_THI(DM dac, DM daf, Mat *A, Vec *scale)
1286 {
1287   PetscInt dim;
1288 
1289   PetscFunctionBeginUser;
1290   PetscValidHeaderSpecific(dac, DM_CLASSID, 1);
1291   PetscValidHeaderSpecific(daf, DM_CLASSID, 2);
1292   PetscAssertPointer(A, 3);
1293   if (scale) PetscAssertPointer(scale, 4);
1294   PetscCall(DMDAGetInfo(daf, &dim, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
1295   if (dim == 2) {
1296     /* We are in the 2D problem and use normal DMDA interpolation */
1297     PetscCall(DMCreateInterpolation(dac, daf, A, scale));
1298   } else {
1299     PetscInt i, j, k, xs, ys, zs, xm, ym, zm, mx, my, mz, rstart, cstart;
1300     Mat      B;
1301 
1302     PetscCall(DMDAGetInfo(daf, 0, &mz, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0));
1303     PetscCall(DMDAGetCorners(daf, &zs, &ys, &xs, &zm, &ym, &xm));
1304     PetscCheck(!zs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "unexpected");
1305     PetscCall(MatCreate(PetscObjectComm((PetscObject)daf), &B));
1306     PetscCall(MatSetSizes(B, xm * ym * zm, xm * ym, mx * my * mz, mx * my));
1307 
1308     PetscCall(MatSetType(B, MATAIJ));
1309     PetscCall(MatSeqAIJSetPreallocation(B, 1, NULL));
1310     PetscCall(MatMPIAIJSetPreallocation(B, 1, NULL, 0, NULL));
1311     PetscCall(MatGetOwnershipRange(B, &rstart, NULL));
1312     PetscCall(MatGetOwnershipRangeColumn(B, &cstart, NULL));
1313     for (i = xs; i < xs + xm; i++) {
1314       for (j = ys; j < ys + ym; j++) {
1315         for (k = zs; k < zs + zm; k++) {
1316           PetscInt    i2 = i * ym + j, i3 = i2 * zm + k;
1317           PetscScalar val = ((k == 0 || k == mz - 1) ? 0.5 : 1.) / (mz - 1.); /* Integration using trapezoid rule */
1318           PetscCall(MatSetValue(B, cstart + i3, rstart + i2, val, INSERT_VALUES));
1319         }
1320       }
1321     }
1322     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1323     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1324     PetscCall(MatCreateMAIJ(B, sizeof(Node) / sizeof(PetscScalar), A));
1325     PetscCall(MatDestroy(&B));
1326   }
1327   PetscFunctionReturn(PETSC_SUCCESS);
1328 }
1329 
1330 static PetscErrorCode DMCreateMatrix_THI_Tridiagonal(DM da, Mat *J)
1331 {
1332   Mat                    A;
1333   PetscInt               xm, ym, zm, dim, dof = 2, starts[3], dims[3];
1334   ISLocalToGlobalMapping ltog;
1335 
1336   PetscFunctionBeginUser;
1337   PetscCall(DMDAGetInfo(da, &dim, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0));
1338   PetscCheck(dim == 3, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Expected DMDA to be 3D");
1339   PetscCall(DMDAGetCorners(da, 0, 0, 0, &zm, &ym, &xm));
1340   PetscCall(DMGetLocalToGlobalMapping(da, &ltog));
1341   PetscCall(MatCreate(PetscObjectComm((PetscObject)da), &A));
1342   PetscCall(MatSetSizes(A, dof * xm * ym * zm, dof * xm * ym * zm, PETSC_DETERMINE, PETSC_DETERMINE));
1343   PetscCall(MatSetType(A, da->mattype));
1344   PetscCall(MatSetFromOptions(A));
1345   PetscCall(MatSeqAIJSetPreallocation(A, 3 * 2, NULL));
1346   PetscCall(MatMPIAIJSetPreallocation(A, 3 * 2, NULL, 0, NULL));
1347   PetscCall(MatSeqBAIJSetPreallocation(A, 2, 3, NULL));
1348   PetscCall(MatMPIBAIJSetPreallocation(A, 2, 3, NULL, 0, NULL));
1349   PetscCall(MatSeqSBAIJSetPreallocation(A, 2, 2, NULL));
1350   PetscCall(MatMPISBAIJSetPreallocation(A, 2, 2, NULL, 0, NULL));
1351   PetscCall(MatSetLocalToGlobalMapping(A, ltog, ltog));
1352   PetscCall(DMDAGetGhostCorners(da, &starts[0], &starts[1], &starts[2], &dims[0], &dims[1], &dims[2]));
1353   PetscCall(MatSetStencil(A, dim, dims, starts, dof));
1354   *J = A;
1355   PetscFunctionReturn(PETSC_SUCCESS);
1356 }
1357 
1358 static PetscErrorCode THIDAVecView_VTK_XML(THI thi, DM da, Vec X, const char filename[])
1359 {
1360   const PetscInt     dof   = 2;
1361   Units              units = thi->units;
1362   MPI_Comm           comm;
1363   PetscViewer        viewer;
1364   PetscMPIInt        rank, size, tag, nn, nmax;
1365   PetscInt           mx, my, mz, range[6];
1366   const PetscScalar *x;
1367 
1368   PetscFunctionBeginUser;
1369   PetscCall(PetscObjectGetComm((PetscObject)thi, &comm));
1370   PetscCall(DMDAGetInfo(da, 0, &mz, &my, &mx, 0, 0, 0, 0, 0, 0, 0, 0, 0));
1371   PetscCallMPI(MPI_Comm_size(comm, &size));
1372   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1373   PetscCall(PetscViewerASCIIOpen(comm, filename, &viewer));
1374   PetscCall(PetscViewerASCIIPrintf(viewer, "<VTKFile type=\"StructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n"));
1375   PetscCall(PetscViewerASCIIPrintf(viewer, "  <StructuredGrid WholeExtent=\"%d %" PetscInt_FMT " %d %" PetscInt_FMT " %d %" PetscInt_FMT "\">\n", 0, mz - 1, 0, my - 1, 0, mx - 1));
1376 
1377   PetscCall(DMDAGetCorners(da, range, range + 1, range + 2, range + 3, range + 4, range + 5));
1378   PetscCall(PetscMPIIntCast(range[3] * range[4] * range[5] * dof, &nn));
1379   PetscCallMPI(MPI_Reduce(&nn, &nmax, 1, MPI_INT, MPI_MAX, 0, comm));
1380   tag = ((PetscObject)viewer)->tag;
1381   PetscCall(VecGetArrayRead(X, &x));
1382   if (rank == 0) {
1383     PetscScalar *array;
1384     PetscCall(PetscMalloc1(nmax, &array));
1385     for (PetscMPIInt r = 0; r < size; r++) {
1386       PetscInt           i, j, k, xs, xm, ys, ym, zs, zm;
1387       const PetscScalar *ptr;
1388       MPI_Status         status;
1389       if (r) PetscCallMPI(MPIU_Recv(range, 6, MPIU_INT, r, tag, comm, MPI_STATUS_IGNORE));
1390       zs = range[0];
1391       ys = range[1];
1392       xs = range[2];
1393       zm = range[3];
1394       ym = range[4];
1395       xm = range[5];
1396       PetscCheck(xm * ym * zm * dof <= nmax, PETSC_COMM_SELF, PETSC_ERR_PLIB, "should not happen");
1397       if (r) {
1398         PetscCallMPI(MPIU_Recv(array, nmax, MPIU_SCALAR, r, tag, comm, &status));
1399         PetscCallMPI(MPI_Get_count(&status, MPIU_SCALAR, &nn));
1400         PetscCheck(nn == xm * ym * zm * dof, PETSC_COMM_SELF, PETSC_ERR_PLIB, "should not happen");
1401         ptr = array;
1402       } else ptr = x;
1403       PetscCall(PetscViewerASCIIPrintf(viewer, "    <Piece Extent=\"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\">\n", zs, zs + zm - 1, ys, ys + ym - 1, xs, xs + xm - 1));
1404 
1405       PetscCall(PetscViewerASCIIPrintf(viewer, "      <Points>\n"));
1406       PetscCall(PetscViewerASCIIPrintf(viewer, "        <DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1407       for (i = xs; i < xs + xm; i++) {
1408         for (j = ys; j < ys + ym; j++) {
1409           for (k = zs; k < zs + zm; k++) {
1410             PrmNode   p;
1411             PetscReal xx = thi->Lx * i / mx, yy = thi->Ly * j / my, zz;
1412             thi->initialize(thi, xx, yy, &p);
1413             zz = PetscRealPart(p.b) + PetscRealPart(p.h) * k / (mz - 1);
1414             PetscCall(PetscViewerASCIIPrintf(viewer, "%f %f %f\n", (double)xx, (double)yy, (double)zz));
1415           }
1416         }
1417       }
1418       PetscCall(PetscViewerASCIIPrintf(viewer, "        </DataArray>\n"));
1419       PetscCall(PetscViewerASCIIPrintf(viewer, "      </Points>\n"));
1420 
1421       PetscCall(PetscViewerASCIIPrintf(viewer, "      <PointData>\n"));
1422       PetscCall(PetscViewerASCIIPrintf(viewer, "        <DataArray type=\"Float32\" Name=\"velocity\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1423       for (i = 0; i < nn; i += dof) PetscCall(PetscViewerASCIIPrintf(viewer, "%f %f %f\n", (double)(PetscRealPart(ptr[i]) * units->year / units->meter), (double)(PetscRealPart(ptr[i + 1]) * units->year / units->meter), 0.0));
1424       PetscCall(PetscViewerASCIIPrintf(viewer, "        </DataArray>\n"));
1425 
1426       PetscCall(PetscViewerASCIIPrintf(viewer, "        <DataArray type=\"Int32\" Name=\"rank\" NumberOfComponents=\"1\" format=\"ascii\">\n"));
1427       for (i = 0; i < nn; i += dof) PetscCall(PetscViewerASCIIPrintf(viewer, "%d\n", r));
1428       PetscCall(PetscViewerASCIIPrintf(viewer, "        </DataArray>\n"));
1429       PetscCall(PetscViewerASCIIPrintf(viewer, "      </PointData>\n"));
1430 
1431       PetscCall(PetscViewerASCIIPrintf(viewer, "    </Piece>\n"));
1432     }
1433     PetscCall(PetscFree(array));
1434   } else {
1435     PetscCallMPI(MPIU_Send(range, 6, MPIU_INT, 0, tag, comm));
1436     PetscCallMPI(MPIU_Send((PetscScalar *)x, nn, MPIU_SCALAR, 0, tag, comm));
1437   }
1438   PetscCall(VecRestoreArrayRead(X, &x));
1439   PetscCall(PetscViewerASCIIPrintf(viewer, "  </StructuredGrid>\n"));
1440   PetscCall(PetscViewerASCIIPrintf(viewer, "</VTKFile>\n"));
1441   PetscCall(PetscViewerDestroy(&viewer));
1442   PetscFunctionReturn(PETSC_SUCCESS);
1443 }
1444 
1445 int main(int argc, char *argv[])
1446 {
1447   MPI_Comm comm;
1448   THI      thi;
1449   DM       da;
1450   SNES     snes;
1451 
1452   PetscFunctionBeginUser;
1453   PetscCall(PetscInitialize(&argc, &argv, 0, help));
1454   comm = PETSC_COMM_WORLD;
1455 
1456   PetscCall(THICreate(comm, &thi));
1457   {
1458     PetscInt M = 3, N = 3, P = 2;
1459     PetscOptionsBegin(comm, NULL, "Grid resolution options", "");
1460     {
1461       PetscCall(PetscOptionsInt("-M", "Number of elements in x-direction on coarse level", "", M, &M, NULL));
1462       N = M;
1463       PetscCall(PetscOptionsInt("-N", "Number of elements in y-direction on coarse level (if different from M)", "", N, &N, NULL));
1464       if (thi->coarse2d) {
1465         PetscCall(PetscOptionsInt("-zlevels", "Number of elements in z-direction on fine level", "", thi->zlevels, &thi->zlevels, NULL));
1466       } else {
1467         PetscCall(PetscOptionsInt("-P", "Number of elements in z-direction on coarse level", "", P, &P, NULL));
1468       }
1469     }
1470     PetscOptionsEnd();
1471     if (thi->coarse2d) {
1472       PetscCall(DMDACreate2d(comm, DM_BOUNDARY_PERIODIC, DM_BOUNDARY_PERIODIC, DMDA_STENCIL_BOX, N, M, PETSC_DETERMINE, PETSC_DETERMINE, sizeof(Node) / sizeof(PetscScalar), 1, 0, 0, &da));
1473       PetscCall(DMSetFromOptions(da));
1474       PetscCall(DMSetUp(da));
1475       da->ops->refinehierarchy     = DMRefineHierarchy_THI;
1476       da->ops->createinterpolation = DMCreateInterpolation_DA_THI;
1477 
1478       PetscCall(PetscObjectCompose((PetscObject)da, "THI", (PetscObject)thi));
1479     } else {
1480       PetscCall(DMDACreate3d(comm, DM_BOUNDARY_NONE, DM_BOUNDARY_PERIODIC, DM_BOUNDARY_PERIODIC, DMDA_STENCIL_BOX, P, N, M, 1, PETSC_DETERMINE, PETSC_DETERMINE, sizeof(Node) / sizeof(PetscScalar), 1, 0, 0, 0, &da));
1481       PetscCall(DMSetFromOptions(da));
1482       PetscCall(DMSetUp(da));
1483     }
1484     PetscCall(DMDASetFieldName(da, 0, "x-velocity"));
1485     PetscCall(DMDASetFieldName(da, 1, "y-velocity"));
1486   }
1487   PetscCall(THISetUpDM(thi, da));
1488   if (thi->tridiagonal) da->ops->creatematrix = DMCreateMatrix_THI_Tridiagonal;
1489 
1490   { /* Set the fine level matrix type if -da_refine */
1491     PetscInt rlevel, clevel;
1492     PetscCall(DMGetRefineLevel(da, &rlevel));
1493     PetscCall(DMGetCoarsenLevel(da, &clevel));
1494     if (rlevel - clevel > 0) PetscCall(DMSetMatType(da, thi->mattype));
1495   }
1496 
1497   PetscCall(DMDASNESSetFunctionLocal(da, ADD_VALUES, (DMDASNESFunctionFn *)THIFunctionLocal, thi));
1498   if (thi->tridiagonal) {
1499     PetscCall(DMDASNESSetJacobianLocal(da, (DMDASNESJacobianFn *)THIJacobianLocal_3D_Tridiagonal, thi));
1500   } else {
1501     PetscCall(DMDASNESSetJacobianLocal(da, (DMDASNESJacobianFn *)THIJacobianLocal_3D_Full, thi));
1502   }
1503   PetscCall(DMCoarsenHookAdd(da, DMCoarsenHook_THI, NULL, thi));
1504   PetscCall(DMRefineHookAdd(da, DMRefineHook_THI, NULL, thi));
1505 
1506   PetscCall(DMSetApplicationContext(da, thi));
1507 
1508   PetscCall(SNESCreate(comm, &snes));
1509   PetscCall(SNESSetDM(snes, da));
1510   PetscCall(DMDestroy(&da));
1511   PetscCall(SNESSetComputeInitialGuess(snes, THIInitial, NULL));
1512   PetscCall(SNESSetFromOptions(snes));
1513 
1514   PetscCall(SNESSolve(snes, NULL, NULL));
1515 
1516   PetscCall(THISolveStatistics(thi, snes, 0, "Full"));
1517 
1518   {
1519     PetscBool flg;
1520     char      filename[PETSC_MAX_PATH_LEN] = "";
1521     PetscCall(PetscOptionsGetString(NULL, NULL, "-o", filename, sizeof(filename), &flg));
1522     if (flg) {
1523       Vec X;
1524       DM  dm;
1525       PetscCall(SNESGetSolution(snes, &X));
1526       PetscCall(SNESGetDM(snes, &dm));
1527       PetscCall(THIDAVecView_VTK_XML(thi, dm, X, filename));
1528     }
1529   }
1530 
1531   PetscCall(DMDestroy(&da));
1532   PetscCall(SNESDestroy(&snes));
1533   PetscCall(THIDestroy(&thi));
1534   PetscCall(PetscFinalize());
1535   return 0;
1536 }
1537 
1538 /*TEST
1539 
1540    build:
1541       requires: !single
1542 
1543    test:
1544       args: -M 6 -P 4 -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type icc
1545 
1546    test:
1547       suffix: 2
1548       nsize: 2
1549       args: -M 6 -P 4 -thi_hom z -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 6 -mg_levels_0_pc_type redundant -snes_grid_sequence 1 -mat_partitioning_type current -ksp_atol 0
1550 
1551    test:
1552       suffix: 3
1553       nsize: 3
1554       args: -M 7 -P 4 -thi_hom z -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type baij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_pc_asm_type restrict -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 9 -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mat_partitioning_type current
1555 
1556    test:
1557       suffix: 4
1558       nsize: 6
1559       args: -M 4 -P 2 -da_refine_hierarchy_x 1,1,3 -da_refine_hierarchy_y 2,2,1 -da_refine_hierarchy_z 2,2,1 -snes_grid_sequence 3 -ksp_converged_reason -ksp_type fgmres -ksp_rtol 1e-2 -pc_type mg -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type bjacobi -mg_levels_1_sub_pc_type cholesky -pc_mg_type multiplicative -snes_converged_reason -snes_stol 1e-12 -thi_L 80e3 -thi_alpha 0.05 -thi_friction_m 1 -thi_hom x -snes_view -mg_levels_0_pc_type redundant -mg_levels_0_ksp_type preonly -ksp_atol 0
1560 
1561    test:
1562       suffix: 5
1563       nsize: 6
1564       args: -M 12 -P 5 -snes_monitor_short -ksp_converged_reason -pc_type asm -pc_asm_type restrict -dm_mat_type {{aij baij sbaij}}
1565 
1566 TEST*/
1567