xref: /petsc/src/snes/tutorials/ex48.c (revision 7a101e5e7ba9859de4c800924a501d6ea3cd325c)
1 static const char help[] = "Toy hydrostatic ice flow with multigrid in 3D.\n\
2 \n\
3 Solves the hydrostatic (aka Blatter/Pattyn/First Order) equations for ice sheet flow\n\
4 using multigrid.  The ice uses a power-law rheology with \"Glen\" exponent 3 (corresponds\n\
5 to p=4/3 in a p-Laplacian).  The focus is on ISMIP-HOM experiments which assume periodic\n\
6 boundary conditions in the x- and y-directions.\n\
7 \n\
8 Equations are rescaled so that the domain size and solution are O(1), details of this scaling\n\
9 can be controlled by the options -units_meter, -units_second, and -units_kilogram.\n\
10 \n\
11 A VTK StructuredGrid output file can be written using the option -o filename.vts\n\
12 \n\n";
13 
14 /*
15 The equations for horizontal velocity (u,v) are
16 
17   - [eta (4 u_x + 2 v_y)]_x - [eta (u_y + v_x)]_y - [eta u_z]_z + rho g s_x = 0
18   - [eta (4 v_y + 2 u_x)]_y - [eta (u_y + v_x)]_x - [eta v_z]_z + rho g s_y = 0
19 
20 where
21 
22   eta = B/2 (epsilon + gamma)^((p-2)/2)
23 
24 is the nonlinear effective viscosity with regularization epsilon and hardness parameter B,
25 written in terms of the second invariant
26 
27   gamma = u_x^2 + v_y^2 + u_x v_y + (1/4) (u_y + v_x)^2 + (1/4) u_z^2 + (1/4) v_z^2
28 
29 The surface boundary conditions are the natural conditions.  The basal boundary conditions
30 are either no-slip, or Navier (linear) slip with spatially variant friction coefficient beta^2.
31 
32 In the code, the equations for (u,v) are multiplied through by 1/(rho g) so that residuals are O(1).
33 
34 The discretization is Q1 finite elements, managed by a DMDA.  The grid is never distorted in the
35 map (x,y) plane, but the bed and surface may be bumpy.  This is handled as usual in FEM, through
36 the Jacobian of the coordinate transformation from a reference element to the physical element.
37 
38 Since ice-flow is tightly coupled in the z-direction (within columns), the DMDA is managed
39 specially so that columns are never distributed, and are always contiguous in memory.
40 This amounts to reversing the meaning of X,Y,Z compared to the DMDA's internal interpretation,
41 and then indexing as vec[i][j][k].  The exotic coarse spaces require 2D DMDAs which are made to
42 use compatible domain decomposition relative to the 3D DMDAs.
43 
44 There are two compile-time options:
45 
46   NO_SSE2:
47     If the host supports SSE2, we use integration code that has been vectorized with SSE2
48     intrinsics, unless this macro is defined.  The intrinsics speed up integration by about
49     30% on my architecture (P8700, gcc-4.5 snapshot).
50 
51   COMPUTE_LOWER_TRIANGULAR:
52     The element matrices we assemble are lower-triangular so it is not necessary to compute
53     all entries explicitly.  If this macro is defined, the lower-triangular entries are
54     computed explicitly.
55 
56 */
57 
58 #if defined(PETSC_APPLE_FRAMEWORK)
59 #import <PETSc/petscsnes.h>
60 #import <PETSc/petsc/private/dmdaimpl.h>     /* There is not yet a public interface to manipulate dm->ops */
61 #else
62 
63 #include <petscsnes.h>
64 #include <petsc/private/dmdaimpl.h>     /* There is not yet a public interface to manipulate dm->ops */
65 #endif
66 #include <ctype.h>              /* toupper() */
67 
68 #if defined(__cplusplus) || defined (PETSC_HAVE_WINDOWS_COMPILERS) || defined (__PGI)
69 /*  c++ cannot handle  [_restrict_] notation like C does */
70 #undef PETSC_RESTRICT
71 #define PETSC_RESTRICT
72 #endif
73 
74 #if defined __SSE2__
75 #  include <emmintrin.h>
76 #endif
77 
78 /* The SSE2 kernels are only for PetscScalar=double on architectures that support it */
79 #if !defined NO_SSE2                           \
80      && !defined PETSC_USE_COMPLEX             \
81      && !defined PETSC_USE_REAL_SINGLE         \
82      && !defined PETSC_USE_REAL___FLOAT128     \
83      && !defined PETSC_USE_REAL___FP16         \
84      && defined __SSE2__
85 #define USE_SSE2_KERNELS 1
86 #else
87 #define USE_SSE2_KERNELS 0
88 #endif
89 
90 static PetscClassId THI_CLASSID;
91 
92 typedef enum {QUAD_GAUSS,QUAD_LOBATTO} QuadratureType;
93 static const char      *QuadratureTypes[] = {"gauss","lobatto","QuadratureType","QUAD_",0};
94 PETSC_UNUSED static const PetscReal HexQWeights[8]     = {1,1,1,1,1,1,1,1};
95 PETSC_UNUSED static const PetscReal HexQNodes[]        = {-0.57735026918962573, 0.57735026918962573};
96 #define G 0.57735026918962573
97 #define H (0.5*(1.+G))
98 #define L (0.5*(1.-G))
99 #define M (-0.5)
100 #define P (0.5)
101 /* Special quadrature: Lobatto in horizontal, Gauss in vertical */
102 static const PetscReal HexQInterp_Lobatto[8][8] = {{H,0,0,0,L,0,0,0},
103                                                    {0,H,0,0,0,L,0,0},
104                                                    {0,0,H,0,0,0,L,0},
105                                                    {0,0,0,H,0,0,0,L},
106                                                    {L,0,0,0,H,0,0,0},
107                                                    {0,L,0,0,0,H,0,0},
108                                                    {0,0,L,0,0,0,H,0},
109                                                    {0,0,0,L,0,0,0,H}};
110 static const PetscReal HexQDeriv_Lobatto[8][8][3] = {
111   {{M*H,M*H,M},{P*H,0,0}  ,{0,0,0}    ,{0,P*H,0}  ,{M*L,M*L,P},{P*L,0,0}  ,{0,0,0}    ,{0,P*L,0}  },
112   {{M*H,0,0}  ,{P*H,M*H,M},{0,P*H,0}  ,{0,0,0}    ,{M*L,0,0}  ,{P*L,M*L,P},{0,P*L,0}  ,{0,0,0}    },
113   {{0,0,0}    ,{0,M*H,0}  ,{P*H,P*H,M},{M*H,0,0}  ,{0,0,0}    ,{0,M*L,0}  ,{P*L,P*L,P},{M*L,0,0}  },
114   {{0,M*H,0}  ,{0,0,0}    ,{P*H,0,0}  ,{M*H,P*H,M},{0,M*L,0}  ,{0,0,0}    ,{P*L,0,0}  ,{M*L,P*L,P}},
115   {{M*L,M*L,M},{P*L,0,0}  ,{0,0,0}    ,{0,P*L,0}  ,{M*H,M*H,P},{P*H,0,0}  ,{0,0,0}    ,{0,P*H,0}  },
116   {{M*L,0,0}  ,{P*L,M*L,M},{0,P*L,0}  ,{0,0,0}    ,{M*H,0,0}  ,{P*H,M*H,P},{0,P*H,0}  ,{0,0,0}    },
117   {{0,0,0}    ,{0,M*L,0}  ,{P*L,P*L,M},{M*L,0,0}  ,{0,0,0}    ,{0,M*H,0}  ,{P*H,P*H,P},{M*H,0,0}  },
118   {{0,M*L,0}  ,{0,0,0}    ,{P*L,0,0}  ,{M*L,P*L,M},{0,M*H,0}  ,{0,0,0}    ,{P*H,0,0}  ,{M*H,P*H,P}}};
119 /* Stanndard Gauss */
120 static const PetscReal HexQInterp_Gauss[8][8] = {{H*H*H,L*H*H,L*L*H,H*L*H, H*H*L,L*H*L,L*L*L,H*L*L},
121                                                  {L*H*H,H*H*H,H*L*H,L*L*H, L*H*L,H*H*L,H*L*L,L*L*L},
122                                                  {L*L*H,H*L*H,H*H*H,L*H*H, L*L*L,H*L*L,H*H*L,L*H*L},
123                                                  {H*L*H,L*L*H,L*H*H,H*H*H, H*L*L,L*L*L,L*H*L,H*H*L},
124                                                  {H*H*L,L*H*L,L*L*L,H*L*L, H*H*H,L*H*H,L*L*H,H*L*H},
125                                                  {L*H*L,H*H*L,H*L*L,L*L*L, L*H*H,H*H*H,H*L*H,L*L*H},
126                                                  {L*L*L,H*L*L,H*H*L,L*H*L, L*L*H,H*L*H,H*H*H,L*H*H},
127                                                  {H*L*L,L*L*L,L*H*L,H*H*L, H*L*H,L*L*H,L*H*H,H*H*H}};
128 static const PetscReal HexQDeriv_Gauss[8][8][3] = {
129   {{M*H*H,H*M*H,H*H*M},{P*H*H,L*M*H,L*H*M},{P*L*H,L*P*H,L*L*M},{M*L*H,H*P*H,H*L*M}, {M*H*L,H*M*L,H*H*P},{P*H*L,L*M*L,L*H*P},{P*L*L,L*P*L,L*L*P},{M*L*L,H*P*L,H*L*P}},
130   {{M*H*H,L*M*H,L*H*M},{P*H*H,H*M*H,H*H*M},{P*L*H,H*P*H,H*L*M},{M*L*H,L*P*H,L*L*M}, {M*H*L,L*M*L,L*H*P},{P*H*L,H*M*L,H*H*P},{P*L*L,H*P*L,H*L*P},{M*L*L,L*P*L,L*L*P}},
131   {{M*L*H,L*M*H,L*L*M},{P*L*H,H*M*H,H*L*M},{P*H*H,H*P*H,H*H*M},{M*H*H,L*P*H,L*H*M}, {M*L*L,L*M*L,L*L*P},{P*L*L,H*M*L,H*L*P},{P*H*L,H*P*L,H*H*P},{M*H*L,L*P*L,L*H*P}},
132   {{M*L*H,H*M*H,H*L*M},{P*L*H,L*M*H,L*L*M},{P*H*H,L*P*H,L*H*M},{M*H*H,H*P*H,H*H*M}, {M*L*L,H*M*L,H*L*P},{P*L*L,L*M*L,L*L*P},{P*H*L,L*P*L,L*H*P},{M*H*L,H*P*L,H*H*P}},
133   {{M*H*L,H*M*L,H*H*M},{P*H*L,L*M*L,L*H*M},{P*L*L,L*P*L,L*L*M},{M*L*L,H*P*L,H*L*M}, {M*H*H,H*M*H,H*H*P},{P*H*H,L*M*H,L*H*P},{P*L*H,L*P*H,L*L*P},{M*L*H,H*P*H,H*L*P}},
134   {{M*H*L,L*M*L,L*H*M},{P*H*L,H*M*L,H*H*M},{P*L*L,H*P*L,H*L*M},{M*L*L,L*P*L,L*L*M}, {M*H*H,L*M*H,L*H*P},{P*H*H,H*M*H,H*H*P},{P*L*H,H*P*H,H*L*P},{M*L*H,L*P*H,L*L*P}},
135   {{M*L*L,L*M*L,L*L*M},{P*L*L,H*M*L,H*L*M},{P*H*L,H*P*L,H*H*M},{M*H*L,L*P*L,L*H*M}, {M*L*H,L*M*H,L*L*P},{P*L*H,H*M*H,H*L*P},{P*H*H,H*P*H,H*H*P},{M*H*H,L*P*H,L*H*P}},
136   {{M*L*L,H*M*L,H*L*M},{P*L*L,L*M*L,L*L*M},{P*H*L,L*P*L,L*H*M},{M*H*L,H*P*L,H*H*M}, {M*L*H,H*M*H,H*L*P},{P*L*H,L*M*H,L*L*P},{P*H*H,L*P*H,L*H*P},{M*H*H,H*P*H,H*H*P}}};
137 static const PetscReal (*HexQInterp)[8],(*HexQDeriv)[8][3];
138 /* Standard 2x2 Gauss quadrature for the bottom layer. */
139 static const PetscReal QuadQInterp[4][4] = {{H*H,L*H,L*L,H*L},
140                                             {L*H,H*H,H*L,L*L},
141                                             {L*L,H*L,H*H,L*H},
142                                             {H*L,L*L,L*H,H*H}};
143 static const PetscReal QuadQDeriv[4][4][2] = {
144   {{M*H,M*H},{P*H,M*L},{P*L,P*L},{M*L,P*H}},
145   {{M*H,M*L},{P*H,M*H},{P*L,P*H},{M*L,P*L}},
146   {{M*L,M*L},{P*L,M*H},{P*H,P*H},{M*H,P*L}},
147   {{M*L,M*H},{P*L,M*L},{P*H,P*L},{M*H,P*H}}};
148 #undef G
149 #undef H
150 #undef L
151 #undef M
152 #undef P
153 
154 #define HexExtract(x,i,j,k,n) do {              \
155     (n)[0] = (x)[i][j][k];                      \
156     (n)[1] = (x)[i+1][j][k];                    \
157     (n)[2] = (x)[i+1][j+1][k];                  \
158     (n)[3] = (x)[i][j+1][k];                    \
159     (n)[4] = (x)[i][j][k+1];                    \
160     (n)[5] = (x)[i+1][j][k+1];                  \
161     (n)[6] = (x)[i+1][j+1][k+1];                \
162     (n)[7] = (x)[i][j+1][k+1];                  \
163   } while (0)
164 
165 #define HexExtractRef(x,i,j,k,n) do {           \
166     (n)[0] = &(x)[i][j][k];                     \
167     (n)[1] = &(x)[i+1][j][k];                   \
168     (n)[2] = &(x)[i+1][j+1][k];                 \
169     (n)[3] = &(x)[i][j+1][k];                   \
170     (n)[4] = &(x)[i][j][k+1];                   \
171     (n)[5] = &(x)[i+1][j][k+1];                 \
172     (n)[6] = &(x)[i+1][j+1][k+1];               \
173     (n)[7] = &(x)[i][j+1][k+1];                 \
174   } while (0)
175 
176 #define QuadExtract(x,i,j,n) do {               \
177     (n)[0] = (x)[i][j];                         \
178     (n)[1] = (x)[i+1][j];                       \
179     (n)[2] = (x)[i+1][j+1];                     \
180     (n)[3] = (x)[i][j+1];                       \
181   } while (0)
182 
183 static void HexGrad(const PetscReal dphi[][3],const PetscReal zn[],PetscReal dz[])
184 {
185   PetscInt i;
186   dz[0] = dz[1] = dz[2] = 0;
187   for (i=0; i<8; i++) {
188     dz[0] += dphi[i][0] * zn[i];
189     dz[1] += dphi[i][1] * zn[i];
190     dz[2] += dphi[i][2] * zn[i];
191   }
192 }
193 
194 static void HexComputeGeometry(PetscInt q,PetscReal hx,PetscReal hy,const PetscReal dz[PETSC_RESTRICT],PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscReal *PETSC_RESTRICT jw)
195 {
196   const PetscReal jac[3][3]  = {{hx/2,0,0}, {0,hy/2,0}, {dz[0],dz[1],dz[2]}};
197   const PetscReal ijac[3][3] = {{1/jac[0][0],0,0}, {0,1/jac[1][1],0}, {-jac[2][0]/(jac[0][0]*jac[2][2]),-jac[2][1]/(jac[1][1]*jac[2][2]),1/jac[2][2]}};
198   const PetscReal jdet       = jac[0][0]*jac[1][1]*jac[2][2];
199   PetscInt        i;
200 
201   for (i=0; i<8; i++) {
202     const PetscReal *dphir = HexQDeriv[q][i];
203     phi[i]     = HexQInterp[q][i];
204     dphi[i][0] = dphir[0]*ijac[0][0] + dphir[1]*ijac[1][0] + dphir[2]*ijac[2][0];
205     dphi[i][1] = dphir[0]*ijac[0][1] + dphir[1]*ijac[1][1] + dphir[2]*ijac[2][1];
206     dphi[i][2] = dphir[0]*ijac[0][2] + dphir[1]*ijac[1][2] + dphir[2]*ijac[2][2];
207   }
208   *jw = 1.0 * jdet;
209 }
210 
211 typedef struct _p_THI   *THI;
212 typedef struct _n_Units *Units;
213 
214 typedef struct {
215   PetscScalar u,v;
216 } Node;
217 
218 typedef struct {
219   PetscScalar b;                /* bed */
220   PetscScalar h;                /* thickness */
221   PetscScalar beta2;            /* friction */
222 } PrmNode;
223 
224 typedef struct {
225   PetscReal min,max,cmin,cmax;
226 } PRange;
227 
228 typedef enum {THIASSEMBLY_TRIDIAGONAL,THIASSEMBLY_FULL} THIAssemblyMode;
229 
230 struct _p_THI {
231   PETSCHEADER(int);
232   void      (*initialize)(THI,PetscReal x,PetscReal y,PrmNode *p);
233   PetscInt  zlevels;
234   PetscReal Lx,Ly,Lz;           /* Model domain */
235   PetscReal alpha;              /* Bed angle */
236   Units     units;
237   PetscReal dirichlet_scale;
238   PetscReal ssa_friction_scale;
239   PRange    eta;
240   PRange    beta2;
241   struct {
242     PetscReal Bd2,eps,exponent;
243   } viscosity;
244   struct {
245     PetscReal irefgam,eps2,exponent,refvel,epsvel;
246   } friction;
247   PetscReal rhog;
248   PetscBool no_slip;
249   PetscBool tridiagonal;
250   PetscBool coarse2d;
251   PetscBool verbose;
252   MatType   mattype;
253 };
254 
255 struct _n_Units {
256   /* fundamental */
257   PetscReal meter;
258   PetscReal kilogram;
259   PetscReal second;
260   /* derived */
261   PetscReal Pascal;
262   PetscReal year;
263 };
264 
265 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo*,Node***,Mat,Mat,THI);
266 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo*,Node***,Mat,Mat,THI);
267 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo*,Node**,Mat,Mat,THI);
268 
269 static void PrmHexGetZ(const PrmNode pn[],PetscInt k,PetscInt zm,PetscReal zn[])
270 {
271   const PetscScalar zm1    = zm-1,
272                     znl[8] = {pn[0].b + pn[0].h*(PetscScalar)k/zm1,
273                               pn[1].b + pn[1].h*(PetscScalar)k/zm1,
274                               pn[2].b + pn[2].h*(PetscScalar)k/zm1,
275                               pn[3].b + pn[3].h*(PetscScalar)k/zm1,
276                               pn[0].b + pn[0].h*(PetscScalar)(k+1)/zm1,
277                               pn[1].b + pn[1].h*(PetscScalar)(k+1)/zm1,
278                               pn[2].b + pn[2].h*(PetscScalar)(k+1)/zm1,
279                               pn[3].b + pn[3].h*(PetscScalar)(k+1)/zm1};
280   PetscInt i;
281   for (i=0; i<8; i++) zn[i] = PetscRealPart(znl[i]);
282 }
283 
284 /* Tests A and C are from the ISMIP-HOM paper (Pattyn et al. 2008) */
285 static void THIInitialize_HOM_A(THI thi,PetscReal x,PetscReal y,PrmNode *p)
286 {
287   Units     units = thi->units;
288   PetscReal s     = -x*PetscSinReal(thi->alpha);
289 
290   p->b     = s - 1000*units->meter + 500*units->meter * PetscSinReal(x*2*PETSC_PI/thi->Lx) * PetscSinReal(y*2*PETSC_PI/thi->Ly);
291   p->h     = s - p->b;
292   p->beta2 = 1e30;
293 }
294 
295 static void THIInitialize_HOM_C(THI thi,PetscReal x,PetscReal y,PrmNode *p)
296 {
297   Units     units = thi->units;
298   PetscReal s     = -x*PetscSinReal(thi->alpha);
299 
300   p->b = s - 1000*units->meter;
301   p->h = s - p->b;
302   /* tau_b = beta2 v   is a stress (Pa) */
303   p->beta2 = 1000 * (1 + PetscSinReal(x*2*PETSC_PI/thi->Lx)*PetscSinReal(y*2*PETSC_PI/thi->Ly)) * units->Pascal * units->year / units->meter;
304 }
305 
306 /* These are just toys */
307 
308 /* Same bed as test A, free slip everywhere except for a discontinuous jump to a circular sticky region in the middle. */
309 static void THIInitialize_HOM_X(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
310 {
311   Units     units = thi->units;
312   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
313   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
314   p->b     = s - 1000*units->meter + 500*units->meter*PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
315   p->h     = s - p->b;
316   p->beta2 = 1000 * (r < 1 ? 2 : 0) * units->Pascal * units->year / units->meter;
317 }
318 
319 /* Like Z, but with 200 meter cliffs */
320 static void THIInitialize_HOM_Y(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
321 {
322   Units     units = thi->units;
323   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
324   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
325 
326   p->b = s - 1000*units->meter + 500*units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
327   if (PetscRealPart(p->b) > -700*units->meter) p->b += 200*units->meter;
328   p->h     = s - p->b;
329   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16*r))/PetscSqrtReal(1e-2 + 16*r)*PetscCosReal(x*3/2)*PetscCosReal(y*3/2)) * units->Pascal * units->year / units->meter;
330 }
331 
332 /* Same bed as A, smoothly varying slipperiness, similar to MATLAB's "sombrero" (uncorrelated with bathymetry) */
333 static void THIInitialize_HOM_Z(THI thi,PetscReal xx,PetscReal yy,PrmNode *p)
334 {
335   Units     units = thi->units;
336   PetscReal x     = xx*2*PETSC_PI/thi->Lx - PETSC_PI,y = yy*2*PETSC_PI/thi->Ly - PETSC_PI; /* [-pi,pi] */
337   PetscReal r     = PetscSqrtReal(x*x + y*y),s = -x*PetscSinReal(thi->alpha);
338 
339   p->b     = s - 1000*units->meter + 500*units->meter * PetscSinReal(x + PETSC_PI) * PetscSinReal(y + PETSC_PI);
340   p->h     = s - p->b;
341   p->beta2 = 1000 * (1. + PetscSinReal(PetscSqrtReal(16*r))/PetscSqrtReal(1e-2 + 16*r)*PetscCosReal(x*3/2)*PetscCosReal(y*3/2)) * units->Pascal * units->year / units->meter;
342 }
343 
344 static void THIFriction(THI thi,PetscReal rbeta2,PetscReal gam,PetscReal *beta2,PetscReal *dbeta2)
345 {
346   if (thi->friction.irefgam == 0) {
347     Units units = thi->units;
348     thi->friction.irefgam = 1./(0.5*PetscSqr(thi->friction.refvel * units->meter / units->year));
349     thi->friction.eps2    = 0.5*PetscSqr(thi->friction.epsvel * units->meter / units->year) * thi->friction.irefgam;
350   }
351   if (thi->friction.exponent == 0) {
352     *beta2  = rbeta2;
353     *dbeta2 = 0;
354   } else {
355     *beta2  = rbeta2 * PetscPowReal(thi->friction.eps2 + gam*thi->friction.irefgam,thi->friction.exponent);
356     *dbeta2 = thi->friction.exponent * *beta2 / (thi->friction.eps2 + gam*thi->friction.irefgam) * thi->friction.irefgam;
357   }
358 }
359 
360 static void THIViscosity(THI thi,PetscReal gam,PetscReal *eta,PetscReal *deta)
361 {
362   PetscReal Bd2,eps,exponent;
363   if (thi->viscosity.Bd2 == 0) {
364     Units units = thi->units;
365     const PetscReal
366       n = 3.,                                           /* Glen exponent */
367       p = 1. + 1./n,                                    /* for Stokes */
368       A = 1.e-16 * PetscPowReal(units->Pascal,-n) / units->year, /* softness parameter (Pa^{-n}/s) */
369       B = PetscPowReal(A,-1./n);                                 /* hardness parameter */
370     thi->viscosity.Bd2      = B/2;
371     thi->viscosity.exponent = (p-2)/2;
372     thi->viscosity.eps      = 0.5*PetscSqr(1e-5 / units->year);
373   }
374   Bd2      = thi->viscosity.Bd2;
375   exponent = thi->viscosity.exponent;
376   eps      = thi->viscosity.eps;
377   *eta     = Bd2 * PetscPowReal(eps + gam,exponent);
378   *deta    = exponent * (*eta) / (eps + gam);
379 }
380 
381 static void RangeUpdate(PetscReal *min,PetscReal *max,PetscReal x)
382 {
383   if (x < *min) *min = x;
384   if (x > *max) *max = x;
385 }
386 
387 static void PRangeClear(PRange *p)
388 {
389   p->cmin = p->min = 1e100;
390   p->cmax = p->max = -1e100;
391 }
392 
393 static PetscErrorCode PRangeMinMax(PRange *p,PetscReal min,PetscReal max)
394 {
395   PetscFunctionBeginUser;
396   p->cmin = min;
397   p->cmax = max;
398   if (min < p->min) p->min = min;
399   if (max > p->max) p->max = max;
400   PetscFunctionReturn(0);
401 }
402 
403 static PetscErrorCode THIDestroy(THI *thi)
404 {
405   PetscFunctionBeginUser;
406   if (!*thi) PetscFunctionReturn(0);
407   if (--((PetscObject)(*thi))->refct > 0) {*thi = 0; PetscFunctionReturn(0);}
408   PetscCall(PetscFree((*thi)->units));
409   PetscCall(PetscFree((*thi)->mattype));
410   PetscCall(PetscHeaderDestroy(thi));
411   PetscFunctionReturn(0);
412 }
413 
414 static PetscErrorCode THICreate(MPI_Comm comm,THI *inthi)
415 {
416   static PetscBool registered = PETSC_FALSE;
417   THI              thi;
418   Units            units;
419 
420   PetscFunctionBeginUser;
421   *inthi = 0;
422   if (!registered) {
423     PetscCall(PetscClassIdRegister("Toy Hydrostatic Ice",&THI_CLASSID));
424     registered = PETSC_TRUE;
425   }
426   PetscCall(PetscHeaderCreate(thi,THI_CLASSID,"THI","Toy Hydrostatic Ice","",comm,THIDestroy,0));
427 
428   PetscCall(PetscNew(&thi->units));
429   units           = thi->units;
430   units->meter    = 1e-2;
431   units->second   = 1e-7;
432   units->kilogram = 1e-12;
433 
434   PetscOptionsBegin(comm,NULL,"Scaled units options","");
435   {
436     PetscCall(PetscOptionsReal("-units_meter","1 meter in scaled length units","",units->meter,&units->meter,NULL));
437     PetscCall(PetscOptionsReal("-units_second","1 second in scaled time units","",units->second,&units->second,NULL));
438     PetscCall(PetscOptionsReal("-units_kilogram","1 kilogram in scaled mass units","",units->kilogram,&units->kilogram,NULL));
439   }
440   PetscOptionsEnd();
441   units->Pascal = units->kilogram / (units->meter * PetscSqr(units->second));
442   units->year   = 31556926. * units->second; /* seconds per year */
443 
444   thi->Lx              = 10.e3;
445   thi->Ly              = 10.e3;
446   thi->Lz              = 1000;
447   thi->dirichlet_scale = 1;
448   thi->verbose         = PETSC_FALSE;
449 
450   PetscOptionsBegin(comm,NULL,"Toy Hydrostatic Ice options","");
451   {
452     QuadratureType quad       = QUAD_GAUSS;
453     char           homexp[]   = "A";
454     char           mtype[256] = MATSBAIJ;
455     PetscReal      L,m = 1.0;
456     PetscBool      flg;
457     L    = thi->Lx;
458     PetscCall(PetscOptionsReal("-thi_L","Domain size (m)","",L,&L,&flg));
459     if (flg) thi->Lx = thi->Ly = L;
460     PetscCall(PetscOptionsReal("-thi_Lx","X Domain size (m)","",thi->Lx,&thi->Lx,NULL));
461     PetscCall(PetscOptionsReal("-thi_Ly","Y Domain size (m)","",thi->Ly,&thi->Ly,NULL));
462     PetscCall(PetscOptionsReal("-thi_Lz","Z Domain size (m)","",thi->Lz,&thi->Lz,NULL));
463     PetscCall(PetscOptionsString("-thi_hom","ISMIP-HOM experiment (A or C)","",homexp,homexp,sizeof(homexp),NULL));
464     switch (homexp[0] = toupper(homexp[0])) {
465     case 'A':
466       thi->initialize = THIInitialize_HOM_A;
467       thi->no_slip    = PETSC_TRUE;
468       thi->alpha      = 0.5;
469       break;
470     case 'C':
471       thi->initialize = THIInitialize_HOM_C;
472       thi->no_slip    = PETSC_FALSE;
473       thi->alpha      = 0.1;
474       break;
475     case 'X':
476       thi->initialize = THIInitialize_HOM_X;
477       thi->no_slip    = PETSC_FALSE;
478       thi->alpha      = 0.3;
479       break;
480     case 'Y':
481       thi->initialize = THIInitialize_HOM_Y;
482       thi->no_slip    = PETSC_FALSE;
483       thi->alpha      = 0.5;
484       break;
485     case 'Z':
486       thi->initialize = THIInitialize_HOM_Z;
487       thi->no_slip    = PETSC_FALSE;
488       thi->alpha      = 0.5;
489       break;
490     default:
491       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"HOM experiment '%c' not implemented",homexp[0]);
492     }
493     PetscCall(PetscOptionsEnum("-thi_quadrature","Quadrature to use for 3D elements","",QuadratureTypes,(PetscEnum)quad,(PetscEnum*)&quad,NULL));
494     switch (quad) {
495     case QUAD_GAUSS:
496       HexQInterp = HexQInterp_Gauss;
497       HexQDeriv  = HexQDeriv_Gauss;
498       break;
499     case QUAD_LOBATTO:
500       HexQInterp = HexQInterp_Lobatto;
501       HexQDeriv  = HexQDeriv_Lobatto;
502       break;
503     }
504     PetscCall(PetscOptionsReal("-thi_alpha","Bed angle (degrees)","",thi->alpha,&thi->alpha,NULL));
505 
506     thi->friction.refvel = 100.;
507     thi->friction.epsvel = 1.;
508 
509     PetscCall(PetscOptionsReal("-thi_friction_refvel","Reference velocity for sliding","",thi->friction.refvel,&thi->friction.refvel,NULL));
510     PetscCall(PetscOptionsReal("-thi_friction_epsvel","Regularization velocity for sliding","",thi->friction.epsvel,&thi->friction.epsvel,NULL));
511     PetscCall(PetscOptionsReal("-thi_friction_m","Friction exponent, 0=Coulomb, 1=Navier","",m,&m,NULL));
512 
513     thi->friction.exponent = (m-1)/2;
514 
515     PetscCall(PetscOptionsReal("-thi_dirichlet_scale","Scale Dirichlet boundary conditions by this factor","",thi->dirichlet_scale,&thi->dirichlet_scale,NULL));
516     PetscCall(PetscOptionsReal("-thi_ssa_friction_scale","Scale slip boundary conditions by this factor in SSA (2D) assembly","",thi->ssa_friction_scale,&thi->ssa_friction_scale,NULL));
517     PetscCall(PetscOptionsBool("-thi_coarse2d","Use a 2D coarse space corresponding to SSA","",thi->coarse2d,&thi->coarse2d,NULL));
518     PetscCall(PetscOptionsBool("-thi_tridiagonal","Assemble a tridiagonal system (column coupling only) on the finest level","",thi->tridiagonal,&thi->tridiagonal,NULL));
519     PetscCall(PetscOptionsFList("-thi_mat_type","Matrix type","MatSetType",MatList,mtype,(char*)mtype,sizeof(mtype),NULL));
520     PetscCall(PetscStrallocpy(mtype,(char**)&thi->mattype));
521     PetscCall(PetscOptionsBool("-thi_verbose","Enable verbose output (like matrix sizes and statistics)","",thi->verbose,&thi->verbose,NULL));
522   }
523   PetscOptionsEnd();
524 
525   /* dimensionalize */
526   thi->Lx    *= units->meter;
527   thi->Ly    *= units->meter;
528   thi->Lz    *= units->meter;
529   thi->alpha *= PETSC_PI / 180;
530 
531   PRangeClear(&thi->eta);
532   PRangeClear(&thi->beta2);
533 
534   {
535     PetscReal u       = 1000*units->meter/(3e7*units->second),
536               gradu   = u / (100*units->meter),eta,deta,
537               rho     = 910 * units->kilogram/PetscPowReal(units->meter,3),
538               grav    = 9.81 * units->meter/PetscSqr(units->second),
539               driving = rho * grav * PetscSinReal(thi->alpha) * 1000*units->meter;
540     THIViscosity(thi,0.5*gradu*gradu,&eta,&deta);
541     thi->rhog = rho * grav;
542     if (thi->verbose) {
543       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Units: meter %8.2g  second %8.2g  kg %8.2g  Pa %8.2g\n",(double)units->meter,(double)units->second,(double)units->kilogram,(double)units->Pascal));
544       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Domain (%6.2g,%6.2g,%6.2g), pressure %8.2g, driving stress %8.2g\n",(double)thi->Lx,(double)thi->Ly,(double)thi->Lz,(double)(rho*grav*1e3*units->meter),(double)driving));
545       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Large velocity 1km/a %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n",(double)u,(double)gradu,(double)eta,(double)(2*eta*gradu),(double)(2*eta*gradu/driving)));
546       THIViscosity(thi,0.5*PetscSqr(1e-3*gradu),&eta,&deta);
547       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Small velocity 1m/a  %8.2g, velocity gradient %8.2g, eta %8.2g, stress %8.2g, ratio %8.2g\n",(double)(1e-3*u),(double)(1e-3*gradu),(double)eta,(double)(2*eta*1e-3*gradu),(double)(2*eta*1e-3*gradu/driving)));
548     }
549   }
550 
551   *inthi = thi;
552   PetscFunctionReturn(0);
553 }
554 
555 static PetscErrorCode THIInitializePrm(THI thi,DM da2prm,Vec prm)
556 {
557   PrmNode        **p;
558   PetscInt       i,j,xs,xm,ys,ym,mx,my;
559 
560   PetscFunctionBeginUser;
561   PetscCall(DMDAGetGhostCorners(da2prm,&ys,&xs,0,&ym,&xm,0));
562   PetscCall(DMDAGetInfo(da2prm,0, &my,&mx,0, 0,0,0, 0,0,0,0,0,0));
563   PetscCall(DMDAVecGetArray(da2prm,prm,&p));
564   for (i=xs; i<xs+xm; i++) {
565     for (j=ys; j<ys+ym; j++) {
566       PetscReal xx = thi->Lx*i/mx,yy = thi->Ly*j/my;
567       thi->initialize(thi,xx,yy,&p[i][j]);
568     }
569   }
570   PetscCall(DMDAVecRestoreArray(da2prm,prm,&p));
571   PetscFunctionReturn(0);
572 }
573 
574 static PetscErrorCode THISetUpDM(THI thi,DM dm)
575 {
576   PetscInt        refinelevel,coarsenlevel,level,dim,Mx,My,Mz,mx,my,s;
577   DMDAStencilType st;
578   DM              da2prm;
579   Vec             X;
580 
581   PetscFunctionBeginUser;
582   PetscCall(DMDAGetInfo(dm,&dim, &Mz,&My,&Mx, 0,&my,&mx, 0,&s,0,0,0,&st));
583   if (dim == 2) {
584     PetscCall(DMDAGetInfo(dm,&dim, &My,&Mx,0, &my,&mx,0, 0,&s,0,0,0,&st));
585   }
586   PetscCall(DMGetRefineLevel(dm,&refinelevel));
587   PetscCall(DMGetCoarsenLevel(dm,&coarsenlevel));
588   level = refinelevel - coarsenlevel;
589   PetscCall(DMDACreate2d(PetscObjectComm((PetscObject)thi),DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,st,My,Mx,my,mx,sizeof(PrmNode)/sizeof(PetscScalar),s,0,0,&da2prm));
590   PetscCall(DMSetUp(da2prm));
591   PetscCall(DMCreateLocalVector(da2prm,&X));
592   {
593     PetscReal Lx = thi->Lx / thi->units->meter,Ly = thi->Ly / thi->units->meter,Lz = thi->Lz / thi->units->meter;
594     if (dim == 2) {
595       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Level %" PetscInt_FMT " domain size (m) %8.2g x %8.2g, num elements %" PetscInt_FMT " x %" PetscInt_FMT " (%" PetscInt_FMT "), size (m) %g x %g\n",level,(double)Lx,(double)Ly,Mx,My,Mx*My,(double)(Lx/Mx),(double)(Ly/My)));
596     } else {
597       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)thi),"Level %" PetscInt_FMT " domain size (m) %8.2g x %8.2g x %8.2g, num elements %" PetscInt_FMT " x %" PetscInt_FMT " x %" PetscInt_FMT " (%" PetscInt_FMT "), size (m) %g x %g x %g\n",level,(double)Lx,(double)Ly,(double)Lz,Mx,My,Mz,Mx*My*Mz,(double)(Lx/Mx),(double)(Ly/My),(double)(1000./(Mz-1))));
598     }
599   }
600   PetscCall(THIInitializePrm(thi,da2prm,X));
601   if (thi->tridiagonal) {       /* Reset coarse Jacobian evaluation */
602     PetscCall(DMDASNESSetJacobianLocal(dm,(DMDASNESJacobian)THIJacobianLocal_3D_Full,thi));
603   }
604   if (thi->coarse2d) PetscCall(DMDASNESSetJacobianLocal(dm,(DMDASNESJacobian)THIJacobianLocal_2D,thi));
605   PetscCall(PetscObjectCompose((PetscObject)dm,"DMDA2Prm",(PetscObject)da2prm));
606   PetscCall(PetscObjectCompose((PetscObject)dm,"DMDA2Prm_Vec",(PetscObject)X));
607   PetscCall(DMDestroy(&da2prm));
608   PetscCall(VecDestroy(&X));
609   PetscFunctionReturn(0);
610 }
611 
612 static PetscErrorCode DMCoarsenHook_THI(DM dmf,DM dmc,void *ctx)
613 {
614   THI            thi = (THI)ctx;
615   PetscInt       rlevel,clevel;
616 
617   PetscFunctionBeginUser;
618   PetscCall(THISetUpDM(thi,dmc));
619   PetscCall(DMGetRefineLevel(dmc,&rlevel));
620   PetscCall(DMGetCoarsenLevel(dmc,&clevel));
621   if (rlevel-clevel == 0) PetscCall(DMSetMatType(dmc,MATAIJ));
622   PetscCall(DMCoarsenHookAdd(dmc,DMCoarsenHook_THI,NULL,thi));
623   PetscFunctionReturn(0);
624 }
625 
626 static PetscErrorCode DMRefineHook_THI(DM dmc,DM dmf,void *ctx)
627 {
628   THI            thi = (THI)ctx;
629 
630   PetscFunctionBeginUser;
631   PetscCall(THISetUpDM(thi,dmf));
632   PetscCall(DMSetMatType(dmf,thi->mattype));
633   PetscCall(DMRefineHookAdd(dmf,DMRefineHook_THI,NULL,thi));
634   /* With grid sequencing, a formerly-refined DM will later be coarsened by PCSetUp_MG */
635   PetscCall(DMCoarsenHookAdd(dmf,DMCoarsenHook_THI,NULL,thi));
636   PetscFunctionReturn(0);
637 }
638 
639 static PetscErrorCode THIDAGetPrm(DM da,PrmNode ***prm)
640 {
641   DM             da2prm;
642   Vec            X;
643 
644   PetscFunctionBeginUser;
645   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm",(PetscObject*)&da2prm));
646   PetscCheck(da2prm,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm composed with given DMDA");
647   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm_Vec",(PetscObject*)&X));
648   PetscCheck(X,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm_Vec composed with given DMDA");
649   PetscCall(DMDAVecGetArray(da2prm,X,prm));
650   PetscFunctionReturn(0);
651 }
652 
653 static PetscErrorCode THIDARestorePrm(DM da,PrmNode ***prm)
654 {
655   DM             da2prm;
656   Vec            X;
657 
658   PetscFunctionBeginUser;
659   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm",(PetscObject*)&da2prm));
660   PetscCheck(da2prm,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm composed with given DMDA");
661   PetscCall(PetscObjectQuery((PetscObject)da,"DMDA2Prm_Vec",(PetscObject*)&X));
662   PetscCheck(X,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"No DMDA2Prm_Vec composed with given DMDA");
663   PetscCall(DMDAVecRestoreArray(da2prm,X,prm));
664   PetscFunctionReturn(0);
665 }
666 
667 static PetscErrorCode THIInitial(SNES snes,Vec X,void *ctx)
668 {
669   THI            thi;
670   PetscInt       i,j,k,xs,xm,ys,ym,zs,zm,mx,my;
671   PetscReal      hx,hy;
672   PrmNode        **prm;
673   Node           ***x;
674   DM             da;
675 
676   PetscFunctionBeginUser;
677   PetscCall(SNESGetDM(snes,&da));
678   PetscCall(DMGetApplicationContext(da,&thi));
679   PetscCall(DMDAGetInfo(da,0, 0,&my,&mx, 0,0,0, 0,0,0,0,0,0));
680   PetscCall(DMDAGetCorners(da,&zs,&ys,&xs,&zm,&ym,&xm));
681   PetscCall(DMDAVecGetArray(da,X,&x));
682   PetscCall(THIDAGetPrm(da,&prm));
683   hx   = thi->Lx / mx;
684   hy   = thi->Ly / my;
685   for (i=xs; i<xs+xm; i++) {
686     for (j=ys; j<ys+ym; j++) {
687       for (k=zs; k<zs+zm; k++) {
688         const PetscScalar zm1      = zm-1,
689                           drivingx = thi->rhog * (prm[i+1][j].b+prm[i+1][j].h - prm[i-1][j].b-prm[i-1][j].h) / (2*hx),
690                           drivingy = thi->rhog * (prm[i][j+1].b+prm[i][j+1].h - prm[i][j-1].b-prm[i][j-1].h) / (2*hy);
691         x[i][j][k].u = 0. * drivingx * prm[i][j].h*(PetscScalar)k/zm1;
692         x[i][j][k].v = 0. * drivingy * prm[i][j].h*(PetscScalar)k/zm1;
693       }
694     }
695   }
696   PetscCall(DMDAVecRestoreArray(da,X,&x));
697   PetscCall(THIDARestorePrm(da,&prm));
698   PetscFunctionReturn(0);
699 }
700 
701 static void PointwiseNonlinearity(THI thi,const Node n[PETSC_RESTRICT],const PetscReal phi[PETSC_RESTRICT],PetscReal dphi[PETSC_RESTRICT][3],PetscScalar *PETSC_RESTRICT u,PetscScalar *PETSC_RESTRICT v,PetscScalar du[PETSC_RESTRICT],PetscScalar dv[PETSC_RESTRICT],PetscReal *eta,PetscReal *deta)
702 {
703   PetscInt    l,ll;
704   PetscScalar gam;
705 
706   du[0] = du[1] = du[2] = 0;
707   dv[0] = dv[1] = dv[2] = 0;
708   *u    = 0;
709   *v    = 0;
710   for (l=0; l<8; l++) {
711     *u += phi[l] * n[l].u;
712     *v += phi[l] * n[l].v;
713     for (ll=0; ll<3; ll++) {
714       du[ll] += dphi[l][ll] * n[l].u;
715       dv[ll] += dphi[l][ll] * n[l].v;
716     }
717   }
718   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0]*dv[1] + 0.25*PetscSqr(du[1]+dv[0]) + 0.25*PetscSqr(du[2]) + 0.25*PetscSqr(dv[2]);
719   THIViscosity(thi,PetscRealPart(gam),eta,deta);
720 }
721 
722 static void PointwiseNonlinearity2D(THI thi,Node n[],PetscReal phi[],PetscReal dphi[4][2],PetscScalar *u,PetscScalar *v,PetscScalar du[],PetscScalar dv[],PetscReal *eta,PetscReal *deta)
723 {
724   PetscInt    l,ll;
725   PetscScalar gam;
726 
727   du[0] = du[1] = 0;
728   dv[0] = dv[1] = 0;
729   *u    = 0;
730   *v    = 0;
731   for (l=0; l<4; l++) {
732     *u += phi[l] * n[l].u;
733     *v += phi[l] * n[l].v;
734     for (ll=0; ll<2; ll++) {
735       du[ll] += dphi[l][ll] * n[l].u;
736       dv[ll] += dphi[l][ll] * n[l].v;
737     }
738   }
739   gam = PetscSqr(du[0]) + PetscSqr(dv[1]) + du[0]*dv[1] + 0.25*PetscSqr(du[1]+dv[0]);
740   THIViscosity(thi,PetscRealPart(gam),eta,deta);
741 }
742 
743 static PetscErrorCode THIFunctionLocal(DMDALocalInfo *info,Node ***x,Node ***f,THI thi)
744 {
745   PetscInt       xs,ys,xm,ym,zm,i,j,k,q,l;
746   PetscReal      hx,hy,etamin,etamax,beta2min,beta2max;
747   PrmNode        **prm;
748 
749   PetscFunctionBeginUser;
750   xs = info->zs;
751   ys = info->ys;
752   xm = info->zm;
753   ym = info->ym;
754   zm = info->xm;
755   hx = thi->Lx / info->mz;
756   hy = thi->Ly / info->my;
757 
758   etamin   = 1e100;
759   etamax   = 0;
760   beta2min = 1e100;
761   beta2max = 0;
762 
763   PetscCall(THIDAGetPrm(info->da,&prm));
764 
765   for (i=xs; i<xs+xm; i++) {
766     for (j=ys; j<ys+ym; j++) {
767       PrmNode pn[4];
768       QuadExtract(prm,i,j,pn);
769       for (k=0; k<zm-1; k++) {
770         PetscInt  ls = 0;
771         Node      n[8],*fn[8];
772         PetscReal zn[8],etabase = 0;
773         PrmHexGetZ(pn,k,zm,zn);
774         HexExtract(x,i,j,k,n);
775         HexExtractRef(f,i,j,k,fn);
776         if (thi->no_slip && k == 0) {
777           for (l=0; l<4; l++) n[l].u = n[l].v = 0;
778           /* The first 4 basis functions lie on the bottom layer, so their contribution is exactly 0, hence we can skip them */
779           ls = 4;
780         }
781         for (q=0; q<8; q++) {
782           PetscReal   dz[3],phi[8],dphi[8][3],jw,eta,deta;
783           PetscScalar du[3],dv[3],u,v;
784           HexGrad(HexQDeriv[q],zn,dz);
785           HexComputeGeometry(q,hx,hy,dz,phi,dphi,&jw);
786           PointwiseNonlinearity(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
787           jw /= thi->rhog;      /* scales residuals to be O(1) */
788           if (q == 0) etabase = eta;
789           RangeUpdate(&etamin,&etamax,eta);
790           for (l=ls; l<8; l++) { /* test functions */
791             const PetscReal ds[2] = {-PetscSinReal(thi->alpha),0};
792             const PetscReal pp    = phi[l],*dp = dphi[l];
793             fn[l]->u += dp[0]*jw*eta*(4.*du[0]+2.*dv[1]) + dp[1]*jw*eta*(du[1]+dv[0]) + dp[2]*jw*eta*du[2] + pp*jw*thi->rhog*ds[0];
794             fn[l]->v += dp[1]*jw*eta*(2.*du[0]+4.*dv[1]) + dp[0]*jw*eta*(du[1]+dv[0]) + dp[2]*jw*eta*dv[2] + pp*jw*thi->rhog*ds[1];
795           }
796         }
797         if (k == 0) { /* we are on a bottom face */
798           if (thi->no_slip) {
799             /* Note: Non-Galerkin coarse grid operators are very sensitive to the scaling of Dirichlet boundary
800             * conditions.  After shenanigans above, etabase contains the effective viscosity at the closest quadrature
801             * point to the bed.  We want the diagonal entry in the Dirichlet condition to have similar magnitude to the
802             * diagonal entry corresponding to the adjacent node.  The fundamental scaling of the viscous part is in
803             * diagu, diagv below.  This scaling is easy to recognize by considering the finite difference operator after
804             * scaling by element size.  The no-slip Dirichlet condition is scaled by this factor, and also in the
805             * assembled matrix (see the similar block in THIJacobianLocal).
806             *
807             * Note that the residual at this Dirichlet node is linear in the state at this node, but also depends
808             * (nonlinearly in general) on the neighboring interior nodes through the local viscosity.  This will make
809             * a matrix-free Jacobian have extra entries in the corresponding row.  We assemble only the diagonal part,
810             * so the solution will exactly satisfy the boundary condition after the first linear iteration.
811             */
812             const PetscReal   hz    = PetscRealPart(pn[0].h)/(zm-1.);
813             const PetscScalar diagu = 2*etabase/thi->rhog*(hx*hy/hz + hx*hz/hy + 4*hy*hz/hx),diagv = 2*etabase/thi->rhog*(hx*hy/hz + 4*hx*hz/hy + hy*hz/hx);
814             fn[0]->u = thi->dirichlet_scale*diagu*x[i][j][k].u;
815             fn[0]->v = thi->dirichlet_scale*diagv*x[i][j][k].v;
816           } else {              /* Integrate over bottom face to apply boundary condition */
817             for (q=0; q<4; q++) {
818               const PetscReal jw = 0.25*hx*hy/thi->rhog,*phi = QuadQInterp[q];
819               PetscScalar     u  =0,v=0,rbeta2=0;
820               PetscReal       beta2,dbeta2;
821               for (l=0; l<4; l++) {
822                 u      += phi[l]*n[l].u;
823                 v      += phi[l]*n[l].v;
824                 rbeta2 += phi[l]*pn[l].beta2;
825               }
826               THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
827               RangeUpdate(&beta2min,&beta2max,beta2);
828               for (l=0; l<4; l++) {
829                 const PetscReal pp = phi[l];
830                 fn[ls+l]->u += pp*jw*beta2*u;
831                 fn[ls+l]->v += pp*jw*beta2*v;
832               }
833             }
834           }
835         }
836       }
837     }
838   }
839 
840   PetscCall(THIDARestorePrm(info->da,&prm));
841 
842   PetscCall(PRangeMinMax(&thi->eta,etamin,etamax));
843   PetscCall(PRangeMinMax(&thi->beta2,beta2min,beta2max));
844   PetscFunctionReturn(0);
845 }
846 
847 static PetscErrorCode THIMatrixStatistics(THI thi,Mat B,PetscViewer viewer)
848 {
849   PetscReal      nrm;
850   PetscInt       m;
851   PetscMPIInt    rank;
852 
853   PetscFunctionBeginUser;
854   PetscCall(MatNorm(B,NORM_FROBENIUS,&nrm));
855   PetscCall(MatGetSize(B,&m,0));
856   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)B),&rank));
857   if (rank == 0) {
858     PetscScalar val0,val2;
859     PetscCall(MatGetValue(B,0,0,&val0));
860     PetscCall(MatGetValue(B,2,2,&val2));
861     PetscCall(PetscViewerASCIIPrintf(viewer,"Matrix dim %" PetscInt_FMT " norm %8.2e (0,0) %8.2e  (2,2) %8.2e %8.2e <= eta <= %8.2e %8.2e <= beta2 <= %8.2e\n",m,(double)nrm,(double)PetscRealPart(val0),(double)PetscRealPart(val2),(double)thi->eta.cmin,(double)thi->eta.cmax,(double)thi->beta2.cmin,(double)thi->beta2.cmax));
862   }
863   PetscFunctionReturn(0);
864 }
865 
866 static PetscErrorCode THISurfaceStatistics(DM da,Vec X,PetscReal *min,PetscReal *max,PetscReal *mean)
867 {
868   Node           ***x;
869   PetscInt       i,j,xs,ys,zs,xm,ym,zm,mx,my,mz;
870   PetscReal      umin = 1e100,umax=-1e100;
871   PetscScalar    usum = 0.0,gusum;
872 
873   PetscFunctionBeginUser;
874   *min = *max = *mean = 0;
875   PetscCall(DMDAGetInfo(da,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
876   PetscCall(DMDAGetCorners(da,&zs,&ys,&xs,&zm,&ym,&xm));
877   PetscCheck(zs == 0 && zm == mz,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected decomposition");
878   PetscCall(DMDAVecGetArray(da,X,&x));
879   for (i=xs; i<xs+xm; i++) {
880     for (j=ys; j<ys+ym; j++) {
881       PetscReal u = PetscRealPart(x[i][j][zm-1].u);
882       RangeUpdate(&umin,&umax,u);
883       usum += u;
884     }
885   }
886   PetscCall(DMDAVecRestoreArray(da,X,&x));
887   PetscCallMPI(MPI_Allreduce(&umin,min,1,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)da)));
888   PetscCallMPI(MPI_Allreduce(&umax,max,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)da)));
889   PetscCallMPI(MPI_Allreduce(&usum,&gusum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)da)));
890   *mean = PetscRealPart(gusum) / (mx*my);
891   PetscFunctionReturn(0);
892 }
893 
894 static PetscErrorCode THISolveStatistics(THI thi,SNES snes,PetscInt coarsened,const char name[])
895 {
896   MPI_Comm       comm;
897   Vec            X;
898   DM             dm;
899 
900   PetscFunctionBeginUser;
901   PetscCall(PetscObjectGetComm((PetscObject)thi,&comm));
902   PetscCall(SNESGetSolution(snes,&X));
903   PetscCall(SNESGetDM(snes,&dm));
904   PetscCall(PetscPrintf(comm,"Solution statistics after solve: %s\n",name));
905   {
906     PetscInt            its,lits;
907     SNESConvergedReason reason;
908     PetscCall(SNESGetIterationNumber(snes,&its));
909     PetscCall(SNESGetConvergedReason(snes,&reason));
910     PetscCall(SNESGetLinearSolveIterations(snes,&lits));
911     PetscCall(PetscPrintf(comm,"%s: Number of SNES iterations = %" PetscInt_FMT ", total linear iterations = %" PetscInt_FMT "\n",SNESConvergedReasons[reason],its,lits));
912   }
913   {
914     PetscReal         nrm2,tmin[3]={1e100,1e100,1e100},tmax[3]={-1e100,-1e100,-1e100},min[3],max[3];
915     PetscInt          i,j,m;
916     const PetscScalar *x;
917     PetscCall(VecNorm(X,NORM_2,&nrm2));
918     PetscCall(VecGetLocalSize(X,&m));
919     PetscCall(VecGetArrayRead(X,&x));
920     for (i=0; i<m; i+=2) {
921       PetscReal u = PetscRealPart(x[i]),v = PetscRealPart(x[i+1]),c = PetscSqrtReal(u*u+v*v);
922       tmin[0] = PetscMin(u,tmin[0]);
923       tmin[1] = PetscMin(v,tmin[1]);
924       tmin[2] = PetscMin(c,tmin[2]);
925       tmax[0] = PetscMax(u,tmax[0]);
926       tmax[1] = PetscMax(v,tmax[1]);
927       tmax[2] = PetscMax(c,tmax[2]);
928     }
929     PetscCall(VecRestoreArrayRead(X,&x));
930     PetscCallMPI(MPI_Allreduce(tmin,min,3,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)thi)));
931     PetscCallMPI(MPI_Allreduce(tmax,max,3,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)thi)));
932     /* Dimensionalize to meters/year */
933     nrm2 *= thi->units->year / thi->units->meter;
934     for (j=0; j<3; j++) {
935       min[j] *= thi->units->year / thi->units->meter;
936       max[j] *= thi->units->year / thi->units->meter;
937     }
938     if (min[0] == 0.0) min[0] = 0.0;
939     PetscCall(PetscPrintf(comm,"|X|_2 %g   %g <= u <=  %g   %g <= v <=  %g   %g <= c <=  %g \n",(double)nrm2,(double)min[0],(double)max[0],(double)min[1],(double)max[1],(double)min[2],(double)max[2]));
940     {
941       PetscReal umin,umax,umean;
942       PetscCall(THISurfaceStatistics(dm,X,&umin,&umax,&umean));
943       umin  *= thi->units->year / thi->units->meter;
944       umax  *= thi->units->year / thi->units->meter;
945       umean *= thi->units->year / thi->units->meter;
946       PetscCall(PetscPrintf(comm,"Surface statistics: u in [%12.6e, %12.6e] mean %12.6e\n",(double)umin,(double)umax,(double)umean));
947     }
948     /* These values stay nondimensional */
949     PetscCall(PetscPrintf(comm,"Global eta range   %g to %g converged range %g to %g\n",(double)thi->eta.min,(double)thi->eta.max,(double)thi->eta.cmin,(double)thi->eta.cmax));
950     PetscCall(PetscPrintf(comm,"Global beta2 range %g to %g converged range %g to %g\n",(double)thi->beta2.min,(double)thi->beta2.max,(double)thi->beta2.cmin,(double)thi->beta2.cmax));
951   }
952   PetscFunctionReturn(0);
953 }
954 
955 static PetscErrorCode THIJacobianLocal_2D(DMDALocalInfo *info,Node **x,Mat J,Mat B,THI thi)
956 {
957   PetscInt       xs,ys,xm,ym,i,j,q,l,ll;
958   PetscReal      hx,hy;
959   PrmNode        **prm;
960 
961   PetscFunctionBeginUser;
962   xs = info->ys;
963   ys = info->xs;
964   xm = info->ym;
965   ym = info->xm;
966   hx = thi->Lx / info->my;
967   hy = thi->Ly / info->mx;
968 
969   PetscCall(MatZeroEntries(B));
970   PetscCall(THIDAGetPrm(info->da,&prm));
971 
972   for (i=xs; i<xs+xm; i++) {
973     for (j=ys; j<ys+ym; j++) {
974       Node        n[4];
975       PrmNode     pn[4];
976       PetscScalar Ke[4*2][4*2];
977       QuadExtract(prm,i,j,pn);
978       QuadExtract(x,i,j,n);
979       PetscCall(PetscMemzero(Ke,sizeof(Ke)));
980       for (q=0; q<4; q++) {
981         PetscReal   phi[4],dphi[4][2],jw,eta,deta,beta2,dbeta2;
982         PetscScalar u,v,du[2],dv[2],h = 0,rbeta2 = 0;
983         for (l=0; l<4; l++) {
984           phi[l]     = QuadQInterp[q][l];
985           dphi[l][0] = QuadQDeriv[q][l][0]*2./hx;
986           dphi[l][1] = QuadQDeriv[q][l][1]*2./hy;
987           h         += phi[l] * pn[l].h;
988           rbeta2    += phi[l] * pn[l].beta2;
989         }
990         jw = 0.25*hx*hy / thi->rhog; /* rhog is only scaling */
991         PointwiseNonlinearity2D(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
992         THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
993         for (l=0; l<4; l++) {
994           const PetscReal pp = phi[l],*dp = dphi[l];
995           for (ll=0; ll<4; ll++) {
996             const PetscReal ppl = phi[ll],*dpl = dphi[ll];
997             PetscScalar     dgdu,dgdv;
998             dgdu = 2.*du[0]*dpl[0] + dv[1]*dpl[0] + 0.5*(du[1]+dv[0])*dpl[1];
999             dgdv = 2.*dv[1]*dpl[1] + du[0]*dpl[1] + 0.5*(du[1]+dv[0])*dpl[0];
1000             /* Picard part */
1001             Ke[l*2+0][ll*2+0] += dp[0]*jw*eta*4.*dpl[0] + dp[1]*jw*eta*dpl[1] + pp*jw*(beta2/h)*ppl*thi->ssa_friction_scale;
1002             Ke[l*2+0][ll*2+1] += dp[0]*jw*eta*2.*dpl[1] + dp[1]*jw*eta*dpl[0];
1003             Ke[l*2+1][ll*2+0] += dp[1]*jw*eta*2.*dpl[0] + dp[0]*jw*eta*dpl[1];
1004             Ke[l*2+1][ll*2+1] += dp[1]*jw*eta*4.*dpl[1] + dp[0]*jw*eta*dpl[0] + pp*jw*(beta2/h)*ppl*thi->ssa_friction_scale;
1005             /* extra Newton terms */
1006             Ke[l*2+0][ll*2+0] += dp[0]*jw*deta*dgdu*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdu*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*u*u*ppl*thi->ssa_friction_scale;
1007             Ke[l*2+0][ll*2+1] += dp[0]*jw*deta*dgdv*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdv*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*u*v*ppl*thi->ssa_friction_scale;
1008             Ke[l*2+1][ll*2+0] += dp[1]*jw*deta*dgdu*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdu*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*v*u*ppl*thi->ssa_friction_scale;
1009             Ke[l*2+1][ll*2+1] += dp[1]*jw*deta*dgdv*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdv*(du[1]+dv[0]) + pp*jw*(dbeta2/h)*v*v*ppl*thi->ssa_friction_scale;
1010           }
1011         }
1012       }
1013       {
1014         const MatStencil rc[4] = {{0,i,j,0},{0,i+1,j,0},{0,i+1,j+1,0},{0,i,j+1,0}};
1015         PetscCall(MatSetValuesBlockedStencil(B,4,rc,4,rc,&Ke[0][0],ADD_VALUES));
1016       }
1017     }
1018   }
1019   PetscCall(THIDARestorePrm(info->da,&prm));
1020 
1021   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1022   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1023   PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
1024   if (thi->verbose) PetscCall(THIMatrixStatistics(thi,B,PETSC_VIEWER_STDOUT_WORLD));
1025   PetscFunctionReturn(0);
1026 }
1027 
1028 static PetscErrorCode THIJacobianLocal_3D(DMDALocalInfo *info,Node ***x,Mat B,THI thi,THIAssemblyMode amode)
1029 {
1030   PetscInt       xs,ys,xm,ym,zm,i,j,k,q,l,ll;
1031   PetscReal      hx,hy;
1032   PrmNode        **prm;
1033 
1034   PetscFunctionBeginUser;
1035   xs = info->zs;
1036   ys = info->ys;
1037   xm = info->zm;
1038   ym = info->ym;
1039   zm = info->xm;
1040   hx = thi->Lx / info->mz;
1041   hy = thi->Ly / info->my;
1042 
1043   PetscCall(MatZeroEntries(B));
1044   PetscCall(MatSetOption(B,MAT_SUBSET_OFF_PROC_ENTRIES,PETSC_TRUE));
1045   PetscCall(THIDAGetPrm(info->da,&prm));
1046 
1047   for (i=xs; i<xs+xm; i++) {
1048     for (j=ys; j<ys+ym; j++) {
1049       PrmNode pn[4];
1050       QuadExtract(prm,i,j,pn);
1051       for (k=0; k<zm-1; k++) {
1052         Node        n[8];
1053         PetscReal   zn[8],etabase = 0;
1054         PetscScalar Ke[8*2][8*2];
1055         PetscInt    ls = 0;
1056 
1057         PrmHexGetZ(pn,k,zm,zn);
1058         HexExtract(x,i,j,k,n);
1059         PetscCall(PetscMemzero(Ke,sizeof(Ke)));
1060         if (thi->no_slip && k == 0) {
1061           for (l=0; l<4; l++) n[l].u = n[l].v = 0;
1062           ls = 4;
1063         }
1064         for (q=0; q<8; q++) {
1065           PetscReal   dz[3],phi[8],dphi[8][3],jw,eta,deta;
1066           PetscScalar du[3],dv[3],u,v;
1067           HexGrad(HexQDeriv[q],zn,dz);
1068           HexComputeGeometry(q,hx,hy,dz,phi,dphi,&jw);
1069           PointwiseNonlinearity(thi,n,phi,dphi,&u,&v,du,dv,&eta,&deta);
1070           jw /= thi->rhog;      /* residuals are scaled by this factor */
1071           if (q == 0) etabase = eta;
1072           for (l=ls; l<8; l++) { /* test functions */
1073             const PetscReal *PETSC_RESTRICT dp = dphi[l];
1074 #if USE_SSE2_KERNELS
1075             /* gcc (up to my 4.5 snapshot) is really bad at hoisting intrinsics so we do it manually */
1076             __m128d
1077               p4         = _mm_set1_pd(4),p2 = _mm_set1_pd(2),p05 = _mm_set1_pd(0.5),
1078               p42        = _mm_setr_pd(4,2),p24 = _mm_shuffle_pd(p42,p42,_MM_SHUFFLE2(0,1)),
1079               du0        = _mm_set1_pd(du[0]),du1 = _mm_set1_pd(du[1]),du2 = _mm_set1_pd(du[2]),
1080               dv0        = _mm_set1_pd(dv[0]),dv1 = _mm_set1_pd(dv[1]),dv2 = _mm_set1_pd(dv[2]),
1081               jweta      = _mm_set1_pd(jw*eta),jwdeta = _mm_set1_pd(jw*deta),
1082               dp0        = _mm_set1_pd(dp[0]),dp1 = _mm_set1_pd(dp[1]),dp2 = _mm_set1_pd(dp[2]),
1083               dp0jweta   = _mm_mul_pd(dp0,jweta),dp1jweta = _mm_mul_pd(dp1,jweta),dp2jweta = _mm_mul_pd(dp2,jweta),
1084               p4du0p2dv1 = _mm_add_pd(_mm_mul_pd(p4,du0),_mm_mul_pd(p2,dv1)), /* 4 du0 + 2 dv1 */
1085               p4dv1p2du0 = _mm_add_pd(_mm_mul_pd(p4,dv1),_mm_mul_pd(p2,du0)), /* 4 dv1 + 2 du0 */
1086               pdu2dv2    = _mm_unpacklo_pd(du2,dv2),                          /* [du2, dv2] */
1087               du1pdv0    = _mm_add_pd(du1,dv0),                               /* du1 + dv0 */
1088               t1         = _mm_mul_pd(dp0,p4du0p2dv1),                        /* dp0 (4 du0 + 2 dv1) */
1089               t2         = _mm_mul_pd(dp1,p4dv1p2du0);                        /* dp1 (4 dv1 + 2 du0) */
1090 
1091 #endif
1092 #if defined COMPUTE_LOWER_TRIANGULAR  /* The element matrices are always symmetric so computing the lower-triangular part is not necessary */
1093             for (ll=ls; ll<8; ll++) { /* trial functions */
1094 #else
1095             for (ll=l; ll<8; ll++) {
1096 #endif
1097               const PetscReal *PETSC_RESTRICT dpl = dphi[ll];
1098               if (amode == THIASSEMBLY_TRIDIAGONAL && (l-ll)%4) continue; /* these entries would not be inserted */
1099 #if !USE_SSE2_KERNELS
1100               /* The analytic Jacobian in nice, easy-to-read form */
1101               {
1102                 PetscScalar dgdu,dgdv;
1103                 dgdu = 2.*du[0]*dpl[0] + dv[1]*dpl[0] + 0.5*(du[1]+dv[0])*dpl[1] + 0.5*du[2]*dpl[2];
1104                 dgdv = 2.*dv[1]*dpl[1] + du[0]*dpl[1] + 0.5*(du[1]+dv[0])*dpl[0] + 0.5*dv[2]*dpl[2];
1105                 /* Picard part */
1106                 Ke[l*2+0][ll*2+0] += dp[0]*jw*eta*4.*dpl[0] + dp[1]*jw*eta*dpl[1] + dp[2]*jw*eta*dpl[2];
1107                 Ke[l*2+0][ll*2+1] += dp[0]*jw*eta*2.*dpl[1] + dp[1]*jw*eta*dpl[0];
1108                 Ke[l*2+1][ll*2+0] += dp[1]*jw*eta*2.*dpl[0] + dp[0]*jw*eta*dpl[1];
1109                 Ke[l*2+1][ll*2+1] += dp[1]*jw*eta*4.*dpl[1] + dp[0]*jw*eta*dpl[0] + dp[2]*jw*eta*dpl[2];
1110                 /* extra Newton terms */
1111                 Ke[l*2+0][ll*2+0] += dp[0]*jw*deta*dgdu*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdu*(du[1]+dv[0]) + dp[2]*jw*deta*dgdu*du[2];
1112                 Ke[l*2+0][ll*2+1] += dp[0]*jw*deta*dgdv*(4.*du[0]+2.*dv[1]) + dp[1]*jw*deta*dgdv*(du[1]+dv[0]) + dp[2]*jw*deta*dgdv*du[2];
1113                 Ke[l*2+1][ll*2+0] += dp[1]*jw*deta*dgdu*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdu*(du[1]+dv[0]) + dp[2]*jw*deta*dgdu*dv[2];
1114                 Ke[l*2+1][ll*2+1] += dp[1]*jw*deta*dgdv*(4.*dv[1]+2.*du[0]) + dp[0]*jw*deta*dgdv*(du[1]+dv[0]) + dp[2]*jw*deta*dgdv*dv[2];
1115               }
1116 #else
1117               /* This SSE2 code is an exact replica of above, but uses explicit packed instructions for some speed
1118               * benefit.  On my hardware, these intrinsics are almost twice as fast as above, reducing total assembly cost
1119               * by 25 to 30 percent. */
1120               {
1121                 __m128d
1122                   keu   = _mm_loadu_pd(&Ke[l*2+0][ll*2+0]),
1123                   kev   = _mm_loadu_pd(&Ke[l*2+1][ll*2+0]),
1124                   dpl01 = _mm_loadu_pd(&dpl[0]),dpl10 = _mm_shuffle_pd(dpl01,dpl01,_MM_SHUFFLE2(0,1)),dpl2 = _mm_set_sd(dpl[2]),
1125                   t0,t3,pdgduv;
1126                 keu = _mm_add_pd(keu,_mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp0jweta,p42),dpl01),
1127                                                 _mm_add_pd(_mm_mul_pd(dp1jweta,dpl10),
1128                                                            _mm_mul_pd(dp2jweta,dpl2))));
1129                 kev = _mm_add_pd(kev,_mm_add_pd(_mm_mul_pd(_mm_mul_pd(dp1jweta,p24),dpl01),
1130                                                 _mm_add_pd(_mm_mul_pd(dp0jweta,dpl10),
1131                                                            _mm_mul_pd(dp2jweta,_mm_shuffle_pd(dpl2,dpl2,_MM_SHUFFLE2(0,1))))));
1132                 pdgduv = _mm_mul_pd(p05,_mm_add_pd(_mm_add_pd(_mm_mul_pd(p42,_mm_mul_pd(du0,dpl01)),
1133                                                               _mm_mul_pd(p24,_mm_mul_pd(dv1,dpl01))),
1134                                                    _mm_add_pd(_mm_mul_pd(du1pdv0,dpl10),
1135                                                               _mm_mul_pd(pdu2dv2,_mm_set1_pd(dpl[2]))))); /* [dgdu, dgdv] */
1136                 t0 = _mm_mul_pd(jwdeta,pdgduv);  /* jw deta [dgdu, dgdv] */
1137                 t3 = _mm_mul_pd(t0,du1pdv0);     /* t0 (du1 + dv0) */
1138                 _mm_storeu_pd(&Ke[l*2+0][ll*2+0],_mm_add_pd(keu,_mm_add_pd(_mm_mul_pd(t1,t0),
1139                                                                            _mm_add_pd(_mm_mul_pd(dp1,t3),
1140                                                                                       _mm_mul_pd(t0,_mm_mul_pd(dp2,du2))))));
1141                 _mm_storeu_pd(&Ke[l*2+1][ll*2+0],_mm_add_pd(kev,_mm_add_pd(_mm_mul_pd(t2,t0),
1142                                                                            _mm_add_pd(_mm_mul_pd(dp0,t3),
1143                                                                                       _mm_mul_pd(t0,_mm_mul_pd(dp2,dv2))))));
1144               }
1145 #endif
1146             }
1147           }
1148         }
1149         if (k == 0) { /* on a bottom face */
1150           if (thi->no_slip) {
1151             const PetscReal   hz    = PetscRealPart(pn[0].h)/(zm-1);
1152             const PetscScalar diagu = 2*etabase/thi->rhog*(hx*hy/hz + hx*hz/hy + 4*hy*hz/hx),diagv = 2*etabase/thi->rhog*(hx*hy/hz + 4*hx*hz/hy + hy*hz/hx);
1153             Ke[0][0] = thi->dirichlet_scale*diagu;
1154             Ke[1][1] = thi->dirichlet_scale*diagv;
1155           } else {
1156             for (q=0; q<4; q++) {
1157               const PetscReal jw = 0.25*hx*hy/thi->rhog,*phi = QuadQInterp[q];
1158               PetscScalar     u  =0,v=0,rbeta2=0;
1159               PetscReal       beta2,dbeta2;
1160               for (l=0; l<4; l++) {
1161                 u      += phi[l]*n[l].u;
1162                 v      += phi[l]*n[l].v;
1163                 rbeta2 += phi[l]*pn[l].beta2;
1164               }
1165               THIFriction(thi,PetscRealPart(rbeta2),PetscRealPart(u*u+v*v)/2,&beta2,&dbeta2);
1166               for (l=0; l<4; l++) {
1167                 const PetscReal pp = phi[l];
1168                 for (ll=0; ll<4; ll++) {
1169                   const PetscReal ppl = phi[ll];
1170                   Ke[l*2+0][ll*2+0] += pp*jw*beta2*ppl + pp*jw*dbeta2*u*u*ppl;
1171                   Ke[l*2+0][ll*2+1] +=                   pp*jw*dbeta2*u*v*ppl;
1172                   Ke[l*2+1][ll*2+0] +=                   pp*jw*dbeta2*v*u*ppl;
1173                   Ke[l*2+1][ll*2+1] += pp*jw*beta2*ppl + pp*jw*dbeta2*v*v*ppl;
1174                 }
1175               }
1176             }
1177           }
1178         }
1179         {
1180           const MatStencil rc[8] = {{i,j,k,0},{i+1,j,k,0},{i+1,j+1,k,0},{i,j+1,k,0},{i,j,k+1,0},{i+1,j,k+1,0},{i+1,j+1,k+1,0},{i,j+1,k+1,0}};
1181           if (amode == THIASSEMBLY_TRIDIAGONAL) {
1182             for (l=0; l<4; l++) { /* Copy out each of the blocks, discarding horizontal coupling */
1183               const PetscInt   l4     = l+4;
1184               const MatStencil rcl[2] = {{rc[l].k,rc[l].j,rc[l].i,0},{rc[l4].k,rc[l4].j,rc[l4].i,0}};
1185 #if defined COMPUTE_LOWER_TRIANGULAR
1186               const PetscScalar Kel[4][4] = {{Ke[2*l+0][2*l+0] ,Ke[2*l+0][2*l+1] ,Ke[2*l+0][2*l4+0] ,Ke[2*l+0][2*l4+1]},
1187                                              {Ke[2*l+1][2*l+0] ,Ke[2*l+1][2*l+1] ,Ke[2*l+1][2*l4+0] ,Ke[2*l+1][2*l4+1]},
1188                                              {Ke[2*l4+0][2*l+0],Ke[2*l4+0][2*l+1],Ke[2*l4+0][2*l4+0],Ke[2*l4+0][2*l4+1]},
1189                                              {Ke[2*l4+1][2*l+0],Ke[2*l4+1][2*l+1],Ke[2*l4+1][2*l4+0],Ke[2*l4+1][2*l4+1]}};
1190 #else
1191               /* Same as above except for the lower-left block */
1192               const PetscScalar Kel[4][4] = {{Ke[2*l+0][2*l+0] ,Ke[2*l+0][2*l+1] ,Ke[2*l+0][2*l4+0] ,Ke[2*l+0][2*l4+1]},
1193                                              {Ke[2*l+1][2*l+0] ,Ke[2*l+1][2*l+1] ,Ke[2*l+1][2*l4+0] ,Ke[2*l+1][2*l4+1]},
1194                                              {Ke[2*l+0][2*l4+0],Ke[2*l+1][2*l4+0],Ke[2*l4+0][2*l4+0],Ke[2*l4+0][2*l4+1]},
1195                                              {Ke[2*l+0][2*l4+1],Ke[2*l+1][2*l4+1],Ke[2*l4+1][2*l4+0],Ke[2*l4+1][2*l4+1]}};
1196 #endif
1197               PetscCall(MatSetValuesBlockedStencil(B,2,rcl,2,rcl,&Kel[0][0],ADD_VALUES));
1198             }
1199           } else {
1200 #if !defined COMPUTE_LOWER_TRIANGULAR /* fill in lower-triangular part, this is really cheap compared to computing the entries */
1201             for (l=0; l<8; l++) {
1202               for (ll=l+1; ll<8; ll++) {
1203                 Ke[ll*2+0][l*2+0] = Ke[l*2+0][ll*2+0];
1204                 Ke[ll*2+1][l*2+0] = Ke[l*2+0][ll*2+1];
1205                 Ke[ll*2+0][l*2+1] = Ke[l*2+1][ll*2+0];
1206                 Ke[ll*2+1][l*2+1] = Ke[l*2+1][ll*2+1];
1207               }
1208             }
1209 #endif
1210             PetscCall(MatSetValuesBlockedStencil(B,8,rc,8,rc,&Ke[0][0],ADD_VALUES));
1211           }
1212         }
1213       }
1214     }
1215   }
1216   PetscCall(THIDARestorePrm(info->da,&prm));
1217 
1218   PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1219   PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1220   PetscCall(MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE));
1221   if (thi->verbose) PetscCall(THIMatrixStatistics(thi,B,PETSC_VIEWER_STDOUT_WORLD));
1222   PetscFunctionReturn(0);
1223 }
1224 
1225 static PetscErrorCode THIJacobianLocal_3D_Full(DMDALocalInfo *info,Node ***x,Mat A,Mat B,THI thi)
1226 {
1227   PetscFunctionBeginUser;
1228   PetscCall(THIJacobianLocal_3D(info,x,B,thi,THIASSEMBLY_FULL));
1229   PetscFunctionReturn(0);
1230 }
1231 
1232 static PetscErrorCode THIJacobianLocal_3D_Tridiagonal(DMDALocalInfo *info,Node ***x,Mat A,Mat B,THI thi)
1233 {
1234   PetscFunctionBeginUser;
1235   PetscCall(THIJacobianLocal_3D(info,x,B,thi,THIASSEMBLY_TRIDIAGONAL));
1236   PetscFunctionReturn(0);
1237 }
1238 
1239 static PetscErrorCode DMRefineHierarchy_THI(DM dac0,PetscInt nlevels,DM hierarchy[])
1240 {
1241   THI             thi;
1242   PetscInt        dim,M,N,m,n,s,dof;
1243   DM              dac,daf;
1244   DMDAStencilType st;
1245   DM_DA           *ddf,*ddc;
1246 
1247   PetscFunctionBeginUser;
1248   PetscCall(PetscObjectQuery((PetscObject)dac0,"THI",(PetscObject*)&thi));
1249   PetscCheck(thi,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot refine this DMDA, missing composed THI instance");
1250   if (nlevels > 1) {
1251     PetscCall(DMRefineHierarchy(dac0,nlevels-1,hierarchy));
1252     dac  = hierarchy[nlevels-2];
1253   } else {
1254     dac = dac0;
1255   }
1256   PetscCall(DMDAGetInfo(dac,&dim, &N,&M,0, &n,&m,0, &dof,&s,0,0,0,&st));
1257   PetscCheck(dim == 2,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"This function can only refine 2D DMDAs");
1258 
1259   /* Creates a 3D DMDA with the same map-plane layout as the 2D one, with contiguous columns */
1260   PetscCall(DMDACreate3d(PetscObjectComm((PetscObject)dac),DM_BOUNDARY_NONE,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,st,thi->zlevels,N,M,1,n,m,dof,s,NULL,NULL,NULL,&daf));
1261   PetscCall(DMSetUp(daf));
1262 
1263   daf->ops->creatematrix        = dac->ops->creatematrix;
1264   daf->ops->createinterpolation = dac->ops->createinterpolation;
1265   daf->ops->getcoloring         = dac->ops->getcoloring;
1266   ddf                           = (DM_DA*)daf->data;
1267   ddc                           = (DM_DA*)dac->data;
1268   ddf->interptype               = ddc->interptype;
1269 
1270   PetscCall(DMDASetFieldName(daf,0,"x-velocity"));
1271   PetscCall(DMDASetFieldName(daf,1,"y-velocity"));
1272 
1273   hierarchy[nlevels-1] = daf;
1274   PetscFunctionReturn(0);
1275 }
1276 
1277 static PetscErrorCode DMCreateInterpolation_DA_THI(DM dac,DM daf,Mat *A,Vec *scale)
1278 {
1279   PetscInt       dim;
1280 
1281   PetscFunctionBeginUser;
1282   PetscValidHeaderSpecific(dac,DM_CLASSID,1);
1283   PetscValidHeaderSpecific(daf,DM_CLASSID,2);
1284   PetscValidPointer(A,3);
1285   if (scale) PetscValidPointer(scale,4);
1286   PetscCall(DMDAGetInfo(daf,&dim,0,0,0,0,0,0,0,0,0,0,0,0));
1287   if (dim  == 2) {
1288     /* We are in the 2D problem and use normal DMDA interpolation */
1289     PetscCall(DMCreateInterpolation(dac,daf,A,scale));
1290   } else {
1291     PetscInt i,j,k,xs,ys,zs,xm,ym,zm,mx,my,mz,rstart,cstart;
1292     Mat      B;
1293 
1294     PetscCall(DMDAGetInfo(daf,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
1295     PetscCall(DMDAGetCorners(daf,&zs,&ys,&xs,&zm,&ym,&xm));
1296     PetscCheck(!zs,PETSC_COMM_SELF,PETSC_ERR_PLIB,"unexpected");
1297     PetscCall(MatCreate(PetscObjectComm((PetscObject)daf),&B));
1298     PetscCall(MatSetSizes(B,xm*ym*zm,xm*ym,mx*my*mz,mx*my));
1299 
1300     PetscCall(MatSetType(B,MATAIJ));
1301     PetscCall(MatSeqAIJSetPreallocation(B,1,NULL));
1302     PetscCall(MatMPIAIJSetPreallocation(B,1,NULL,0,NULL));
1303     PetscCall(MatGetOwnershipRange(B,&rstart,NULL));
1304     PetscCall(MatGetOwnershipRangeColumn(B,&cstart,NULL));
1305     for (i=xs; i<xs+xm; i++) {
1306       for (j=ys; j<ys+ym; j++) {
1307         for (k=zs; k<zs+zm; k++) {
1308           PetscInt    i2  = i*ym+j,i3 = i2*zm+k;
1309           PetscScalar val = ((k == 0 || k == mz-1) ? 0.5 : 1.) / (mz-1.); /* Integration using trapezoid rule */
1310           PetscCall(MatSetValue(B,cstart+i3,rstart+i2,val,INSERT_VALUES));
1311         }
1312       }
1313     }
1314     PetscCall(MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY));
1315     PetscCall(MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY));
1316     PetscCall(MatCreateMAIJ(B,sizeof(Node)/sizeof(PetscScalar),A));
1317     PetscCall(MatDestroy(&B));
1318   }
1319   PetscFunctionReturn(0);
1320 }
1321 
1322 static PetscErrorCode DMCreateMatrix_THI_Tridiagonal(DM da,Mat *J)
1323 {
1324   Mat                    A;
1325   PetscInt               xm,ym,zm,dim,dof = 2,starts[3],dims[3];
1326   ISLocalToGlobalMapping ltog;
1327 
1328   PetscFunctionBeginUser;
1329   PetscCall(DMDAGetInfo(da,&dim, 0,0,0, 0,0,0, 0,0,0,0,0,0));
1330   PetscCheck(dim == 3,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Expected DMDA to be 3D");
1331   PetscCall(DMDAGetCorners(da,0,0,0,&zm,&ym,&xm));
1332   PetscCall(DMGetLocalToGlobalMapping(da,&ltog));
1333   PetscCall(MatCreate(PetscObjectComm((PetscObject)da),&A));
1334   PetscCall(MatSetSizes(A,dof*xm*ym*zm,dof*xm*ym*zm,PETSC_DETERMINE,PETSC_DETERMINE));
1335   PetscCall(MatSetType(A,da->mattype));
1336   PetscCall(MatSetFromOptions(A));
1337   PetscCall(MatSeqAIJSetPreallocation(A,3*2,NULL));
1338   PetscCall(MatMPIAIJSetPreallocation(A,3*2,NULL,0,NULL));
1339   PetscCall(MatSeqBAIJSetPreallocation(A,2,3,NULL));
1340   PetscCall(MatMPIBAIJSetPreallocation(A,2,3,NULL,0,NULL));
1341   PetscCall(MatSeqSBAIJSetPreallocation(A,2,2,NULL));
1342   PetscCall(MatMPISBAIJSetPreallocation(A,2,2,NULL,0,NULL));
1343   PetscCall(MatSetLocalToGlobalMapping(A,ltog,ltog));
1344   PetscCall(DMDAGetGhostCorners(da,&starts[0],&starts[1],&starts[2],&dims[0],&dims[1],&dims[2]));
1345   PetscCall(MatSetStencil(A,dim,dims,starts,dof));
1346   *J   = A;
1347   PetscFunctionReturn(0);
1348 }
1349 
1350 static PetscErrorCode THIDAVecView_VTK_XML(THI thi,DM da,Vec X,const char filename[])
1351 {
1352   const PetscInt    dof   = 2;
1353   Units             units = thi->units;
1354   MPI_Comm          comm;
1355   PetscViewer       viewer;
1356   PetscMPIInt       rank,size,tag,nn,nmax;
1357   PetscInt          mx,my,mz,r,range[6];
1358   const PetscScalar *x;
1359 
1360   PetscFunctionBeginUser;
1361   PetscCall(PetscObjectGetComm((PetscObject)thi,&comm));
1362   PetscCall(DMDAGetInfo(da,0, &mz,&my,&mx, 0,0,0, 0,0,0,0,0,0));
1363   PetscCallMPI(MPI_Comm_size(comm,&size));
1364   PetscCallMPI(MPI_Comm_rank(comm,&rank));
1365   PetscCall(PetscViewerASCIIOpen(comm,filename,&viewer));
1366   PetscCall(PetscViewerASCIIPrintf(viewer,"<VTKFile type=\"StructuredGrid\" version=\"0.1\" byte_order=\"LittleEndian\">\n"));
1367   PetscCall(PetscViewerASCIIPrintf(viewer,"  <StructuredGrid WholeExtent=\"%d %" PetscInt_FMT " %d %" PetscInt_FMT " %d %" PetscInt_FMT "\">\n",0,mz-1,0,my-1,0,mx-1));
1368 
1369   PetscCall(DMDAGetCorners(da,range,range+1,range+2,range+3,range+4,range+5));
1370   PetscCall(PetscMPIIntCast(range[3]*range[4]*range[5]*dof,&nn));
1371   PetscCallMPI(MPI_Reduce(&nn,&nmax,1,MPI_INT,MPI_MAX,0,comm));
1372   tag  = ((PetscObject) viewer)->tag;
1373   PetscCall(VecGetArrayRead(X,&x));
1374   if (rank == 0) {
1375     PetscScalar *array;
1376     PetscCall(PetscMalloc1(nmax,&array));
1377     for (r=0; r<size; r++) {
1378       PetscInt          i,j,k,xs,xm,ys,ym,zs,zm;
1379       const PetscScalar *ptr;
1380       MPI_Status        status;
1381       if (r) {
1382         PetscCallMPI(MPI_Recv(range,6,MPIU_INT,r,tag,comm,MPI_STATUS_IGNORE));
1383       }
1384       zs = range[0];ys = range[1];xs = range[2];zm = range[3];ym = range[4];xm = range[5];
1385       PetscCheck(xm*ym*zm*dof <= nmax,PETSC_COMM_SELF,PETSC_ERR_PLIB,"should not happen");
1386       if (r) {
1387         PetscCallMPI(MPI_Recv(array,nmax,MPIU_SCALAR,r,tag,comm,&status));
1388         PetscCallMPI(MPI_Get_count(&status,MPIU_SCALAR,&nn));
1389         PetscCheck(nn == xm*ym*zm*dof,PETSC_COMM_SELF,PETSC_ERR_PLIB,"should not happen");
1390         ptr = array;
1391       } else ptr = x;
1392       PetscCall(PetscViewerASCIIPrintf(viewer,"    <Piece Extent=\"%" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\">\n",zs,zs+zm-1,ys,ys+ym-1,xs,xs+xm-1));
1393 
1394       PetscCall(PetscViewerASCIIPrintf(viewer,"      <Points>\n"));
1395       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Float32\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1396       for (i=xs; i<xs+xm; i++) {
1397         for (j=ys; j<ys+ym; j++) {
1398           for (k=zs; k<zs+zm; k++) {
1399             PrmNode   p;
1400             PetscReal xx = thi->Lx*i/mx,yy = thi->Ly*j/my,zz;
1401             thi->initialize(thi,xx,yy,&p);
1402             zz   = PetscRealPart(p.b) + PetscRealPart(p.h)*k/(mz-1);
1403             PetscCall(PetscViewerASCIIPrintf(viewer,"%f %f %f\n",(double)xx,(double)yy,(double)zz));
1404           }
1405         }
1406       }
1407       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1408       PetscCall(PetscViewerASCIIPrintf(viewer,"      </Points>\n"));
1409 
1410       PetscCall(PetscViewerASCIIPrintf(viewer,"      <PointData>\n"));
1411       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Float32\" Name=\"velocity\" NumberOfComponents=\"3\" format=\"ascii\">\n"));
1412       for (i=0; i<nn; i+=dof) {
1413         PetscCall(PetscViewerASCIIPrintf(viewer,"%f %f %f\n",(double)(PetscRealPart(ptr[i])*units->year/units->meter),(double)(PetscRealPart(ptr[i+1])*units->year/units->meter),0.0));
1414       }
1415       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1416 
1417       PetscCall(PetscViewerASCIIPrintf(viewer,"        <DataArray type=\"Int32\" Name=\"rank\" NumberOfComponents=\"1\" format=\"ascii\">\n"));
1418       for (i=0; i<nn; i+=dof) {
1419         PetscCall(PetscViewerASCIIPrintf(viewer,"%" PetscInt_FMT "\n",r));
1420       }
1421       PetscCall(PetscViewerASCIIPrintf(viewer,"        </DataArray>\n"));
1422       PetscCall(PetscViewerASCIIPrintf(viewer,"      </PointData>\n"));
1423 
1424       PetscCall(PetscViewerASCIIPrintf(viewer,"    </Piece>\n"));
1425     }
1426     PetscCall(PetscFree(array));
1427   } else {
1428     PetscCallMPI(MPI_Send(range,6,MPIU_INT,0,tag,comm));
1429     PetscCallMPI(MPI_Send((PetscScalar*)x,nn,MPIU_SCALAR,0,tag,comm));
1430   }
1431   PetscCall(VecRestoreArrayRead(X,&x));
1432   PetscCall(PetscViewerASCIIPrintf(viewer,"  </StructuredGrid>\n"));
1433   PetscCall(PetscViewerASCIIPrintf(viewer,"</VTKFile>\n"));
1434   PetscCall(PetscViewerDestroy(&viewer));
1435   PetscFunctionReturn(0);
1436 }
1437 
1438 int main(int argc,char *argv[])
1439 {
1440   MPI_Comm       comm;
1441   THI            thi;
1442   DM             da;
1443   SNES           snes;
1444 
1445   PetscFunctionBeginUser;
1446   PetscCall(PetscInitialize(&argc,&argv,0,help));
1447   comm = PETSC_COMM_WORLD;
1448 
1449   PetscCall(THICreate(comm,&thi));
1450   {
1451     PetscInt M = 3,N = 3,P = 2;
1452     PetscOptionsBegin(comm,NULL,"Grid resolution options","");
1453     {
1454       PetscCall(PetscOptionsInt("-M","Number of elements in x-direction on coarse level","",M,&M,NULL));
1455       N    = M;
1456       PetscCall(PetscOptionsInt("-N","Number of elements in y-direction on coarse level (if different from M)","",N,&N,NULL));
1457       if (thi->coarse2d) {
1458         PetscCall(PetscOptionsInt("-zlevels","Number of elements in z-direction on fine level","",thi->zlevels,&thi->zlevels,NULL));
1459       } else {
1460         PetscCall(PetscOptionsInt("-P","Number of elements in z-direction on coarse level","",P,&P,NULL));
1461       }
1462     }
1463     PetscOptionsEnd();
1464     if (thi->coarse2d) {
1465       PetscCall(DMDACreate2d(comm,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC,DMDA_STENCIL_BOX,N,M,PETSC_DETERMINE,PETSC_DETERMINE,sizeof(Node)/sizeof(PetscScalar),1,0,0,&da));
1466       PetscCall(DMSetFromOptions(da));
1467       PetscCall(DMSetUp(da));
1468       da->ops->refinehierarchy     = DMRefineHierarchy_THI;
1469       da->ops->createinterpolation = DMCreateInterpolation_DA_THI;
1470 
1471       PetscCall(PetscObjectCompose((PetscObject)da,"THI",(PetscObject)thi));
1472     } else {
1473       PetscCall(DMDACreate3d(comm,DM_BOUNDARY_NONE,DM_BOUNDARY_PERIODIC,DM_BOUNDARY_PERIODIC, DMDA_STENCIL_BOX,P,N,M,1,PETSC_DETERMINE,PETSC_DETERMINE,sizeof(Node)/sizeof(PetscScalar),1,0,0,0,&da));
1474       PetscCall(DMSetFromOptions(da));
1475       PetscCall(DMSetUp(da));
1476     }
1477     PetscCall(DMDASetFieldName(da,0,"x-velocity"));
1478     PetscCall(DMDASetFieldName(da,1,"y-velocity"));
1479   }
1480   PetscCall(THISetUpDM(thi,da));
1481   if (thi->tridiagonal) da->ops->creatematrix = DMCreateMatrix_THI_Tridiagonal;
1482 
1483   {                             /* Set the fine level matrix type if -da_refine */
1484     PetscInt rlevel,clevel;
1485     PetscCall(DMGetRefineLevel(da,&rlevel));
1486     PetscCall(DMGetCoarsenLevel(da,&clevel));
1487     if (rlevel - clevel > 0) PetscCall(DMSetMatType(da,thi->mattype));
1488   }
1489 
1490   PetscCall(DMDASNESSetFunctionLocal(da,ADD_VALUES,(DMDASNESFunction)THIFunctionLocal,thi));
1491   if (thi->tridiagonal) {
1492     PetscCall(DMDASNESSetJacobianLocal(da,(DMDASNESJacobian)THIJacobianLocal_3D_Tridiagonal,thi));
1493   } else {
1494     PetscCall(DMDASNESSetJacobianLocal(da,(DMDASNESJacobian)THIJacobianLocal_3D_Full,thi));
1495   }
1496   PetscCall(DMCoarsenHookAdd(da,DMCoarsenHook_THI,NULL,thi));
1497   PetscCall(DMRefineHookAdd(da,DMRefineHook_THI,NULL,thi));
1498 
1499   PetscCall(DMSetApplicationContext(da,thi));
1500 
1501   PetscCall(SNESCreate(comm,&snes));
1502   PetscCall(SNESSetDM(snes,da));
1503   PetscCall(DMDestroy(&da));
1504   PetscCall(SNESSetComputeInitialGuess(snes,THIInitial,NULL));
1505   PetscCall(SNESSetFromOptions(snes));
1506 
1507   PetscCall(SNESSolve(snes,NULL,NULL));
1508 
1509   PetscCall(THISolveStatistics(thi,snes,0,"Full"));
1510 
1511   {
1512     PetscBool flg;
1513     char      filename[PETSC_MAX_PATH_LEN] = "";
1514     PetscCall(PetscOptionsGetString(NULL,NULL,"-o",filename,sizeof(filename),&flg));
1515     if (flg) {
1516       Vec X;
1517       DM  dm;
1518       PetscCall(SNESGetSolution(snes,&X));
1519       PetscCall(SNESGetDM(snes,&dm));
1520       PetscCall(THIDAVecView_VTK_XML(thi,dm,X,filename));
1521     }
1522   }
1523 
1524   PetscCall(DMDestroy(&da));
1525   PetscCall(SNESDestroy(&snes));
1526   PetscCall(THIDestroy(&thi));
1527   PetscCall(PetscFinalize());
1528   return 0;
1529 }
1530 
1531 /*TEST
1532 
1533    build:
1534       requires: !single
1535 
1536    test:
1537       args: -M 6 -P 4 -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type icc
1538 
1539    test:
1540       suffix: 2
1541       nsize: 2
1542       args: -M 6 -P 4 -thi_hom z -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type sbaij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 6 -mg_levels_0_pc_type redundant -snes_grid_sequence 1 -mat_partitioning_type current -ksp_atol -1
1543 
1544    test:
1545       suffix: 3
1546       nsize: 3
1547       args: -M 7 -P 4 -thi_hom z -da_refine 1 -snes_monitor_short -snes_converged_reason -ksp_monitor_short -ksp_converged_reason -thi_mat_type baij -ksp_type fgmres -pc_type mg -pc_mg_type full -mg_levels_pc_asm_type restrict -mg_levels_pc_type asm -mg_levels_pc_asm_blocks 9 -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mat_partitioning_type current
1548 
1549    test:
1550       suffix: 4
1551       nsize: 6
1552       args: -M 4 -P 2 -da_refine_hierarchy_x 1,1,3 -da_refine_hierarchy_y 2,2,1 -da_refine_hierarchy_z 2,2,1 -snes_grid_sequence 3 -ksp_converged_reason -ksp_type fgmres -ksp_rtol 1e-2 -pc_type mg -mg_levels_ksp_type gmres -mg_levels_ksp_max_it 1 -mg_levels_pc_type bjacobi -mg_levels_1_sub_pc_type cholesky -pc_mg_type multiplicative -snes_converged_reason -snes_stol 1e-12 -thi_L 80e3 -thi_alpha 0.05 -thi_friction_m 1 -thi_hom x -snes_view -mg_levels_0_pc_type redundant -mg_levels_0_ksp_type preonly -ksp_atol -1
1553 
1554    test:
1555       suffix: 5
1556       nsize: 6
1557       args: -M 12 -P 5 -snes_monitor_short -ksp_converged_reason -pc_type asm -pc_asm_type restrict -dm_mat_type {{aij baij sbaij}}
1558 
1559 TEST*/
1560