xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 174dc0c8cee294b82b85e4dd3b331b29396264fc)
1 /*
2     Provides an interface to the FFTW package.
3     Testing examples can be found in ~src/mat/tests
4 */
5 
6 #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
7 EXTERN_C_BEGIN
8 #if !PetscDefined(HAVE_MPIUNI)
9   #include <fftw3-mpi.h>
10 #else
11   #include <fftw3.h>
12 #endif
13 EXTERN_C_END
14 
/* Private context for MATFFTW matrices: cached FFTW plans plus the bookkeeping
   needed to decide when a cached plan may be reused on new arrays. */
typedef struct {
  ptrdiff_t ndim_fftw, *dim_fftw; /* dimensions in FFTW's native ptrdiff_t type (used by the MPI and >3D interfaces) */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims; /* guru-interface dimension descriptors (64-bit index build) */
#else
  fftw_iodim *iodims; /* guru-interface dimension descriptors */
#endif
  PetscInt     partial_dim;
  fftw_plan    p_forward, p_backward;                       /* created lazily on first MatMult()/MatMultTranspose() */
  unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays because fftw_execute() can only be executed for the arrays with which the plan was created */
} Mat_FFTW;
27 
28 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
29 
/*
  MatMult_SeqFFTW - applies the forward FFT (sequential case), y = FFT(x).

  The plan is created lazily on the first call and cached in fftw->p_forward.
  With any planner flag other than FFTW_ESTIMATE, FFTW plan creation overwrites
  the in/out arrays, so a throwaway duplicate of x is used while planning
  (see FFTW manual sec 2.1 or sec 4); the cached in/out pointers then remain
  unset, forcing the new-array execute path below on every subsequent call.
*/
static PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  Vec                xx; /* scratch vector used only while creating a non-ESTIMATE plan */
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
  #else
  fftw_iodim *iodims;
  #endif
  PetscInt i;
#endif
  PetscInt ndim = fft->ndim, *dim = fft->dim;

  PetscFunctionBegin;
  if (!fftw->p_forward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    default:
      /* >3 dimensions: use FFTW's guru interface; fill iodims with row-major
         strides (last dimension contiguous, is == os since in-place layout) */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
  #if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #else
      if (ndim) {
        iodims[ndim - 1].n  = (int)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #endif

#else
      /* NOTE(review): the (int *) cast of a PetscInt array is only valid when
         PetscInt is 32-bit -- verify behavior on 64-bit-index real builds */
      fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* remember the arrays the plan was built on so fftw_execute() can be used directly */
      fftw->finarray  = (PetscScalar *)x_array;
      fftw->foutarray = y_array;
    }
  }

  if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
    fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
    fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
#endif
  } else {
    fftw_execute(fftw->p_forward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
133 
/*
  MatMultTranspose_SeqFFTW - applies the backward (inverse, unnormalized) FFT
  in the sequential case, y = IFFT(x).

  Mirrors MatMult_SeqFFTW: the backward plan is created lazily and cached in
  fftw->p_backward; with non-ESTIMATE planner flags a scratch duplicate of x is
  used during planning because FFTW overwrites the arrays (FFTW manual sec 2.1/4).
*/
static PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  Vec                xx; /* scratch vector used only while creating a non-ESTIMATE plan */
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims = fftw->iodims; /* NOTE(review): assumed already filled (e.g. by a prior MatMult_SeqFFTW call) -- verify */
  #else
  fftw_iodim *iodims = fftw->iodims; /* NOTE(review): assumed already filled (e.g. by a prior MatMult_SeqFFTW call) -- verify */
  #endif
#endif

  PetscFunctionBegin;
  if (!fftw->p_backward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    default:
      /* >3 dimensions: guru interface with the shared iodims descriptors */
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #endif
#else
      /* NOTE(review): the (int *) cast of a PetscInt array is only valid when
         PetscInt is 32-bit -- verify behavior on 64-bit-index real builds */
      fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* remember the arrays the plan was built on so fftw_execute() can be used directly */
      fftw->binarray  = (PetscScalar *)x_array;
      fftw->boutarray = y_array;
    }
  }
  if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
    fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
    fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
#endif
  } else {
    fftw_execute(fftw->p_backward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
217 
218 #if !PetscDefined(HAVE_MPIUNI)
/*
  MatMult_MPIFFTW - applies the forward FFT in the MPI-parallel case.

  Same lazy-plan/caching scheme as the sequential version, but built on the
  fftw_mpi planner interface. Parallel real 1D transforms are not supported
  by FFTW and raise PETSC_ERR_SUP.
*/
static PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;
  Vec                xx; /* scratch vector used only while creating a non-ESTIMATE plan */

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  if (!fftw->p_forward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
      fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      /* NOTE(review): the real-transform plans below hardcode FFTW_ESTIMATE instead of fftw->p_flag -- presumably deliberate, confirm */
      fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    default:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* remember the arrays the plan was built on so fftw_execute() can be used directly */
      fftw->finarray  = (PetscScalar *)x_array;
      fftw->foutarray = y_array;
    }
  }
  if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
    fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
  } else {
    fftw_execute(fftw->p_forward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
289 
/*
  MatMultTranspose_MPIFFTW - applies the backward (inverse, unnormalized) FFT
  in the MPI-parallel case. Mirrors MatMult_MPIFFTW with FFTW_BACKWARD/c2r plans.
*/
static PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;
  Vec                xx; /* scratch vector used only while creating a non-ESTIMATE plan */

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  if (!fftw->p_backward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
      fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      /* NOTE(review): the real-transform plans below hardcode FFTW_ESTIMATE instead of fftw->p_flag -- presumably deliberate, confirm */
      fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    default:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* remember the arrays the plan was built on so fftw_execute() can be used directly */
      fftw->binarray  = (PetscScalar *)x_array;
      fftw->boutarray = y_array;
    }
  }
  if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
    fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
  } else {
    fftw_execute(fftw->p_backward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
360 #endif
361 
362 static PetscErrorCode MatDestroy_FFTW(Mat A)
363 {
364   Mat_FFT  *fft  = (Mat_FFT *)A->data;
365   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
366 
367   PetscFunctionBegin;
368   fftw_destroy_plan(fftw->p_forward);
369   fftw_destroy_plan(fftw->p_backward);
370   if (fftw->iodims) free(fftw->iodims);
371   PetscCall(PetscFree(fftw->dim_fftw));
372   PetscCall(PetscFree(fft->data));
373   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
374   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
375   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
376 #if !PetscDefined(HAVE_MPIUNI)
377   fftw_mpi_cleanup();
378 #endif
379   PetscFunctionReturn(PETSC_SUCCESS);
380 }
381 
382 #if !PetscDefined(HAVE_MPIUNI)
383   #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
384 static PetscErrorCode VecDestroy_MPIFFTW(Vec v)
385 {
386   PetscScalar *array;
387 
388   PetscFunctionBegin;
389   PetscCall(VecGetArray(v, &array));
390   fftw_free((fftw_complex *)array);
391   PetscCall(VecRestoreArray(v, &array));
392   PetscCall(VecDestroy_MPI(v));
393   PetscFunctionReturn(PETSC_SUCCESS);
394 }
395 #endif
396 
397 #if !PetscDefined(HAVE_MPIUNI)
398 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new)
399 {
400   Mat A;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
404   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
405   PetscFunctionReturn(PETSC_SUCCESS);
406 }
407 
408 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new)
409 {
410   Mat A;
411 
412   PetscFunctionBegin;
413   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
414   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
415   PetscFunctionReturn(PETSC_SUCCESS);
416 }
417 
418 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
419 {
420   Mat A;
421 
422   PetscFunctionBegin;
423   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
424   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
425   PetscFunctionReturn(PETSC_SUCCESS);
426 }
427 #endif
428 
429 /*@
430   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
431   parallel layout determined by `MATFFTW`
432 
433   Collective
434 
435   Input Parameter:
436 . A - the matrix
437 
438   Output Parameters:
439 + x - (optional) input vector of forward FFTW
440 . y - (optional) output vector of forward FFTW
441 - z - (optional) output vector of backward FFTW
442 
443   Options Database Key:
444 . -mat_fftw_plannerflags - set FFTW planner flags
445 
446   Level: advanced
447 
448   Notes:
449   The parallel layout of output of forward FFTW is always same as the input
450   of the backward FFTW. But parallel layout of the input vector of forward
451   FFTW might not be same as the output of backward FFTW.
452 
453   We need to provide enough space while doing parallel real transform.
454   We need to pad extra zeros at the end of last dimension. For this reason the one needs to
455   invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
456   last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
457   depends on if the last dimension is even or odd. If the last dimension is even need to pad two
458   zeros if it is odd only one zero is needed.
459 
460   Lastly one needs some scratch space at the end of data set in each process. alloc_local
461   figures out how much space is needed, i.e. it figures out the data+scratch space for
462   each processor and returns that.
463 
464   Developer Notes:
465   How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?
466 
467 .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
468 @*/
PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on the matrix (MatCreateVecsFFTW_FFTW) */
  PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
  PetscFunctionReturn(PETSC_SUCCESS);
}
475 
476 PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout)
477 {
478   PetscMPIInt size, rank;
479   MPI_Comm    comm;
480   Mat_FFT    *fft = (Mat_FFT *)A->data;
481 
482   PetscFunctionBegin;
483   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
484   PetscValidType(A, 1);
485   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
486 
487   PetscCallMPI(MPI_Comm_size(comm, &size));
488   PetscCallMPI(MPI_Comm_rank(comm, &rank));
489   if (size == 1) { /* sequential case */
490 #if defined(PETSC_USE_COMPLEX)
491     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
492     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
493     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
494 #else
495     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
496     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
497     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
498 #endif
499 #if !PetscDefined(HAVE_MPIUNI)
500   } else { /* parallel cases */
501     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
502     PetscInt      ndim = fft->ndim, *dim = fft->dim;
503     ptrdiff_t     alloc_local, local_n0, local_0_start;
504     ptrdiff_t     local_n1;
505     fftw_complex *data_fout;
506     ptrdiff_t     local_1_start;
507   #if defined(PETSC_USE_COMPLEX)
508     fftw_complex *data_fin, *data_bout;
509   #else
510     double   *data_finr, *data_boutr;
511     PetscInt  n1, N1;
512     ptrdiff_t temp;
513   #endif
514 
515     switch (ndim) {
516     case 1:
517   #if !defined(PETSC_USE_COMPLEX)
518       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
519   #else
520       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
521       if (fin) {
522         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
523         PetscCall(VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin));
524         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
525         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
526         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
527       }
528       if (fout) {
529         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
530         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout));
531         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
532         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
533         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
534       }
535       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
536       if (bout) {
537         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
538         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout));
539         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
540         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
541         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
542       }
543       break;
544   #endif
545     case 2:
546   #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
547       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
548       N1          = 2 * dim[0] * (dim[1] / 2 + 1);
549       n1          = 2 * local_n0 * (dim[1] / 2 + 1);
550       if (fin) {
551         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
552         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
553         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
554         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
555         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
556       }
557       if (fout) {
558         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
559         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
560         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
561         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
562         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
563       }
564       if (bout) {
565         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
566         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
567         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
568         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
569         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
570       }
571   #else
572       /* Get local size */
573       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
574       if (fin) {
575         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
576         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
577         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
578         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
579         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
580       }
581       if (fout) {
582         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
583         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
584         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
585         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
586         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
587       }
588       if (bout) {
589         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
590         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
591         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
592         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
593         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
594       }
595   #endif
596       break;
597     case 3:
598   #if !defined(PETSC_USE_COMPLEX)
599       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
600       N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
601       n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
602       if (fin) {
603         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
604         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
605         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
606         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
607         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
608       }
609       if (fout) {
610         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
611         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
612         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
613         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
614         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
615       }
616       if (bout) {
617         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
618         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
619         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
620         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
621         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
622       }
623   #else
624       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
625       if (fin) {
626         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
627         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
628         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
629         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
630         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
631       }
632       if (fout) {
633         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
634         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
635         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
636         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
637         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
638       }
639       if (bout) {
640         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
641         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
642         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
643         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
644         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
645       }
646   #endif
647       break;
648     default:
649   #if !defined(PETSC_USE_COMPLEX)
650       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
651       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
652       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
653       N1                                    = 2 * fft->N * (PetscInt)(fftw->dim_fftw[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
654       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
655 
656       if (fin) {
657         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
658         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
659         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
660         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
661         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
662       }
663       if (fout) {
664         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
665         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
666         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
667         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
668         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
669       }
670       if (bout) {
671         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
672         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
673         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
674         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
675         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
676       }
677   #else
678       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
679       if (fin) {
680         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
681         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
682         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
683         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
684         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
685       }
686       if (fout) {
687         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
688         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
689         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
690         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
691         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
692       }
693       if (bout) {
694         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
695         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
696         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
697         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
698         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
699       }
700   #endif
701       break;
702     }
703     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
704        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
       memory leaks. We void these pointers here to avoid accidental use.
706     */
707     if (fin) (*fin)->ops->replacearray = NULL;
708     if (fout) (*fout)->ops->replacearray = NULL;
709     if (bout) (*bout)->ops->replacearray = NULL;
710 #endif
711   }
712   PetscFunctionReturn(PETSC_SUCCESS);
713 }
714 
715 /*@
716   VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.
717 
718   Collective
719 
720   Input Parameters:
721 + A - FFTW matrix
722 - x - the PETSc vector
723 
724   Output Parameter:
725 . y - the FFTW vector
726 
727   Level: intermediate
728 
729   Note:
  For real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
  one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
  zeros. This routine does that job by a scattering operation.
733 
734 .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
735 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under "VecScatterPetscToFFTW_C"
     (VecScatterPetscToFFTW_FFTW for MATFFTW); PetscUseMethod() errors if A provides none */
  PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
742 
743 static PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y)
744 {
745   MPI_Comm    comm;
746   Mat_FFT    *fft = (Mat_FFT *)A->data;
747   PetscInt    low;
748   PetscMPIInt rank, size;
749   PetscInt    vsize, vsize1;
750   VecScatter  vecscat;
751   IS          list1;
752 
753   PetscFunctionBegin;
754   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
755   PetscCallMPI(MPI_Comm_size(comm, &size));
756   PetscCallMPI(MPI_Comm_rank(comm, &rank));
757   PetscCall(VecGetOwnershipRange(y, &low, NULL));
758 
759   if (size == 1) {
760     PetscCall(VecGetSize(x, &vsize));
761     PetscCall(VecGetSize(y, &vsize1));
762     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
763     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
764     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
765     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
766     PetscCall(VecScatterDestroy(&vecscat));
767     PetscCall(ISDestroy(&list1));
768 #if !PetscDefined(HAVE_MPIUNI)
769   } else {
770     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
771     PetscInt  ndim = fft->ndim, *dim = fft->dim;
772     ptrdiff_t local_n0, local_0_start;
773     ptrdiff_t local_n1, local_1_start;
774     IS        list2;
775   #if !defined(PETSC_USE_COMPLEX)
776     PetscInt  i, j, k, partial_dim;
777     PetscInt *indx1, *indx2, tempindx, tempindx1;
778     PetscInt  NM;
779     ptrdiff_t temp;
780   #endif
781 
782     switch (ndim) {
783     case 1:
784   #if defined(PETSC_USE_COMPLEX)
785       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
786 
787       PetscCall(ISCreateStride(comm, local_n0, local_0_start, 1, &list1));
788       PetscCall(ISCreateStride(comm, local_n0, low, 1, &list2));
789       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
790       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
791       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
792       PetscCall(VecScatterDestroy(&vecscat));
793       PetscCall(ISDestroy(&list1));
794       PetscCall(ISDestroy(&list2));
795   #else
796       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
797   #endif
798       break;
799     case 2:
800   #if defined(PETSC_USE_COMPLEX)
801       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
802 
803       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
804       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
805       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
806       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
807       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
808       PetscCall(VecScatterDestroy(&vecscat));
809       PetscCall(ISDestroy(&list1));
810       PetscCall(ISDestroy(&list2));
811   #else
812       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
813 
814       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx1));
815       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx2));
816 
817       if (dim[1] % 2 == 0) {
818         NM = dim[1] + 2;
819       } else {
820         NM = dim[1] + 1;
821       }
822       for (i = 0; i < local_n0; i++) {
823         for (j = 0; j < dim[1]; j++) {
824           tempindx  = i * dim[1] + j;
825           tempindx1 = i * NM + j;
826 
827           indx1[tempindx] = local_0_start * dim[1] + tempindx;
828           indx2[tempindx] = low + tempindx1;
829         }
830       }
831 
832       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
833       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
834 
835       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
836       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
837       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
838       PetscCall(VecScatterDestroy(&vecscat));
839       PetscCall(ISDestroy(&list1));
840       PetscCall(ISDestroy(&list2));
841       PetscCall(PetscFree(indx1));
842       PetscCall(PetscFree(indx2));
843   #endif
844       break;
845 
846     case 3:
847   #if defined(PETSC_USE_COMPLEX)
848       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
849 
850       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
851       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
852       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
853       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
854       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
855       PetscCall(VecScatterDestroy(&vecscat));
856       PetscCall(ISDestroy(&list1));
857       PetscCall(ISDestroy(&list2));
858   #else
859       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
860       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
861       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
862 
863       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx1));
864       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx2));
865 
866       if (dim[2] % 2 == 0) NM = dim[2] + 2;
867       else NM = dim[2] + 1;
868 
869       for (i = 0; i < local_n0; i++) {
870         for (j = 0; j < dim[1]; j++) {
871           for (k = 0; k < dim[2]; k++) {
872             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
873             tempindx1 = i * dim[1] * NM + j * NM + k;
874 
875             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
876             indx2[tempindx] = low + tempindx1;
877           }
878         }
879       }
880 
881       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
882       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
883       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
884       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
885       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
886       PetscCall(VecScatterDestroy(&vecscat));
887       PetscCall(ISDestroy(&list1));
888       PetscCall(ISDestroy(&list2));
889       PetscCall(PetscFree(indx1));
890       PetscCall(PetscFree(indx2));
891   #endif
892       break;
893 
894     default:
895   #if defined(PETSC_USE_COMPLEX)
896       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
897 
898       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
899       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
900       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
901       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
902       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
903       PetscCall(VecScatterDestroy(&vecscat));
904       PetscCall(ISDestroy(&list1));
905       PetscCall(ISDestroy(&list2));
906   #else
907       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
908       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
909       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
910 
911       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
912 
913       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
914 
915       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
916 
917       partial_dim = fftw->partial_dim;
918 
919       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx1));
920       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx2));
921 
922       if (dim[ndim - 1] % 2 == 0) NM = 2;
923       else NM = 1;
924 
925       j = low;
926       for (i = 0, k = 1; i < (PetscInt)local_n0 * partial_dim; i++, k++) {
927         indx1[i] = local_0_start * partial_dim + i;
928         indx2[i] = j;
929         if (k % dim[ndim - 1] == 0) j += NM;
930         j++;
931       }
932       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
933       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
934       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
935       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
936       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
937       PetscCall(VecScatterDestroy(&vecscat));
938       PetscCall(ISDestroy(&list1));
939       PetscCall(ISDestroy(&list2));
940       PetscCall(PetscFree(indx1));
941       PetscCall(PetscFree(indx2));
942   #endif
943       break;
944     }
945 #endif
946   }
947   PetscFunctionReturn(PETSC_SUCCESS);
948 }
949 
950 /*@
951   VecScatterFFTWToPetsc - Converts `MATFFTW` output vector to a PETSc vector.
952 
953   Collective
954 
955   Input Parameters:
956 + A - `MATFFTW` matrix
957 - x - FFTW vector
958 
959   Output Parameter:
960 . y - PETSc vector
961 
962   Level: intermediate
963 
964   Note:
  While doing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
966   `VecScatterFFTWToPetsc()` removes those extra zeros.
967 
968 .seealso: [](ch_matrices), `Mat`, `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
969 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under "VecScatterFFTWToPetsc_C"
     (VecScatterFFTWToPetsc_FFTW for MATFFTW); PetscUseMethod() errors if A provides none */
  PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
976 
977 static PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y)
978 {
979   MPI_Comm    comm;
980   Mat_FFT    *fft = (Mat_FFT *)A->data;
981   PetscInt    low;
982   PetscMPIInt rank, size;
983   VecScatter  vecscat;
984   IS          list1;
985 
986   PetscFunctionBegin;
987   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
988   PetscCallMPI(MPI_Comm_size(comm, &size));
989   PetscCallMPI(MPI_Comm_rank(comm, &rank));
990   PetscCall(VecGetOwnershipRange(x, &low, NULL));
991 
992   if (size == 1) {
993     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
994     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
995     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
996     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
997     PetscCall(VecScatterDestroy(&vecscat));
998     PetscCall(ISDestroy(&list1));
999 
1000 #if !PetscDefined(HAVE_MPIUNI)
1001   } else {
1002     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
1003     PetscInt  ndim = fft->ndim, *dim = fft->dim;
1004     ptrdiff_t local_n0, local_0_start;
1005     ptrdiff_t local_n1, local_1_start;
1006     IS        list2;
1007   #if !defined(PETSC_USE_COMPLEX)
1008     PetscInt  i, j, k, partial_dim;
1009     PetscInt *indx1, *indx2, tempindx, tempindx1;
1010     PetscInt  NM;
1011     ptrdiff_t temp;
1012   #endif
1013     switch (ndim) {
1014     case 1:
1015   #if defined(PETSC_USE_COMPLEX)
1016       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1017 
1018       PetscCall(ISCreateStride(comm, local_n1, local_1_start, 1, &list1));
1019       PetscCall(ISCreateStride(comm, local_n1, low, 1, &list2));
1020       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1021       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1022       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1023       PetscCall(VecScatterDestroy(&vecscat));
1024       PetscCall(ISDestroy(&list1));
1025       PetscCall(ISDestroy(&list2));
1026   #else
1027       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
1028   #endif
1029       break;
1030     case 2:
1031   #if defined(PETSC_USE_COMPLEX)
1032       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1033 
1034       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
1035       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
1036       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1037       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1038       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1039       PetscCall(VecScatterDestroy(&vecscat));
1040       PetscCall(ISDestroy(&list1));
1041       PetscCall(ISDestroy(&list2));
1042   #else
1043       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1044 
1045       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx1));
1046       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx2));
1047 
1048       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1049       else NM = dim[1] + 1;
1050 
1051       for (i = 0; i < local_n0; i++) {
1052         for (j = 0; j < dim[1]; j++) {
1053           tempindx  = i * dim[1] + j;
1054           tempindx1 = i * NM + j;
1055 
1056           indx1[tempindx] = local_0_start * dim[1] + tempindx;
1057           indx2[tempindx] = low + tempindx1;
1058         }
1059       }
1060 
1061       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
1062       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
1063 
1064       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1065       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1066       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1067       PetscCall(VecScatterDestroy(&vecscat));
1068       PetscCall(ISDestroy(&list1));
1069       PetscCall(ISDestroy(&list2));
1070       PetscCall(PetscFree(indx1));
1071       PetscCall(PetscFree(indx2));
1072   #endif
1073       break;
1074     case 3:
1075   #if defined(PETSC_USE_COMPLEX)
1076       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1077 
1078       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
1079       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
1080       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1081       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1082       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1083       PetscCall(VecScatterDestroy(&vecscat));
1084       PetscCall(ISDestroy(&list1));
1085       PetscCall(ISDestroy(&list2));
1086   #else
1087       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1088 
1089       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx1));
1090       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx2));
1091 
1092       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1093       else NM = dim[2] + 1;
1094 
1095       for (i = 0; i < local_n0; i++) {
1096         for (j = 0; j < dim[1]; j++) {
1097           for (k = 0; k < dim[2]; k++) {
1098             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1099             tempindx1 = i * dim[1] * NM + j * NM + k;
1100 
1101             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
1102             indx2[tempindx] = low + tempindx1;
1103           }
1104         }
1105       }
1106 
1107       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
1108       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
1109 
1110       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1111       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1112       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1113       PetscCall(VecScatterDestroy(&vecscat));
1114       PetscCall(ISDestroy(&list1));
1115       PetscCall(ISDestroy(&list2));
1116       PetscCall(PetscFree(indx1));
1117       PetscCall(PetscFree(indx2));
1118   #endif
1119       break;
1120     default:
1121   #if defined(PETSC_USE_COMPLEX)
1122       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
1123 
1124       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
1125       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
1126       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1127       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1128       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1129       PetscCall(VecScatterDestroy(&vecscat));
1130       PetscCall(ISDestroy(&list1));
1131       PetscCall(ISDestroy(&list2));
1132   #else
1133       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
1134 
1135       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
1136 
1137       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1138 
1139       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
1140 
1141       partial_dim = fftw->partial_dim;
1142 
1143       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx1));
1144       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx2));
1145 
1146       if (dim[ndim - 1] % 2 == 0) NM = 2;
1147       else NM = 1;
1148 
1149       j = low;
1150       for (i = 0, k = 1; i < (PetscInt)local_n0 * partial_dim; i++, k++) {
1151         indx1[i] = local_0_start * partial_dim + i;
1152         indx2[i] = j;
1153         if (k % dim[ndim - 1] == 0) j += NM;
1154         j++;
1155       }
1156       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
1157       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
1158 
1159       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1160       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1161       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1162       PetscCall(VecScatterDestroy(&vecscat));
1163       PetscCall(ISDestroy(&list1));
1164       PetscCall(ISDestroy(&list2));
1165       PetscCall(PetscFree(indx1));
1166       PetscCall(PetscFree(indx2));
1167   #endif
1168       break;
1169     }
1170 #endif
1171   }
1172   PetscFunctionReturn(PETSC_SUCCESS);
1173 }
1174 
1175 /*MC
1176   MATFFTW -  "fftw" - Matrix type that provides FFT via the FFTW external package.
1177 
1178   Level: intermediate
1179 
1180 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatType`, `MatCreateFFT()`
1181 M*/
1182 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1183 {
1184   MPI_Comm    comm;
1185   Mat_FFT    *fft = (Mat_FFT *)A->data;
1186   Mat_FFTW   *fftw;
1187   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1188   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1189   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1190   PetscBool   flg;
1191   PetscInt    p_flag, partial_dim = 1, ctr;
1192   PetscMPIInt size, rank;
1193   ptrdiff_t  *pdim;
1194 #if !defined(PETSC_USE_COMPLEX)
1195   PetscInt tot_dim;
1196 #endif
1197 
1198   PetscFunctionBegin;
1199   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1200   PetscCallMPI(MPI_Comm_size(comm, &size));
1201   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1202 
1203 #if !PetscDefined(HAVE_MPIUNI)
1204   fftw_mpi_init();
1205 #endif
1206   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1207   pdim[0] = dim[0];
1208 #if !defined(PETSC_USE_COMPLEX)
1209   tot_dim = 2 * dim[0];
1210 #endif
1211   for (ctr = 1; ctr < ndim; ctr++) {
1212     partial_dim *= dim[ctr];
1213     pdim[ctr] = dim[ctr];
1214 #if !defined(PETSC_USE_COMPLEX)
1215     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1216     else tot_dim *= dim[ctr];
1217 #endif
1218   }
1219 
1220   if (size == 1) {
1221 #if defined(PETSC_USE_COMPLEX)
1222     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
1223     fft->n = fft->N;
1224 #else
1225     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
1226     fft->n = tot_dim;
1227 #endif
1228 #if !PetscDefined(HAVE_MPIUNI)
1229   } else {
1230     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1231   #if !defined(PETSC_USE_COMPLEX)
1232     ptrdiff_t temp;
1233     PetscInt  N1;
1234   #endif
1235 
1236     switch (ndim) {
1237     case 1:
1238   #if !defined(PETSC_USE_COMPLEX)
1239       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1240   #else
1241       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1242       fft->n = (PetscInt)local_n0;
1243       PetscCall(MatSetSizes(A, local_n1, fft->n, fft->N, fft->N));
1244   #endif
1245       break;
1246     case 2:
1247   #if defined(PETSC_USE_COMPLEX)
1248       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1249       fft->n = (PetscInt)local_n0 * dim[1];
1250       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1251   #else
1252       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1253 
1254       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1255       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
1256   #endif
1257       break;
1258     case 3:
1259   #if defined(PETSC_USE_COMPLEX)
1260       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1261 
1262       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1263       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1264   #else
1265       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1266 
1267       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1268       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
1269   #endif
1270       break;
1271     default:
1272   #if defined(PETSC_USE_COMPLEX)
1273       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);
1274 
1275       fft->n = (PetscInt)local_n0 * partial_dim;
1276       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1277   #else
1278       temp = pdim[ndim - 1];
1279 
1280       pdim[ndim - 1] = temp / 2 + 1;
1281 
1282       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1283 
1284       fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
1285       N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);
1286 
1287       pdim[ndim - 1] = temp;
1288 
1289       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1290   #endif
1291       break;
1292     }
1293 #endif
1294   }
1295   free(pdim);
1296   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
1297   PetscCall(PetscNew(&fftw));
1298   fft->data = (void *)fftw;
1299 
1300   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1301   fftw->partial_dim = partial_dim;
1302 
1303   PetscCall(PetscMalloc1(ndim, &fftw->dim_fftw));
1304   if (size == 1) {
1305 #if defined(PETSC_USE_64BIT_INDICES)
1306     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1307 #else
1308     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1309 #endif
1310   }
1311 
1312   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];
1313 
1314   fftw->p_forward  = NULL;
1315   fftw->p_backward = NULL;
1316   fftw->p_flag     = FFTW_ESTIMATE;
1317 
1318   if (size == 1) {
1319     A->ops->mult          = MatMult_SeqFFTW;
1320     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1321 #if !PetscDefined(HAVE_MPIUNI)
1322   } else {
1323     A->ops->mult          = MatMult_MPIFFTW;
1324     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1325 #endif
1326   }
1327   fft->matdestroy = MatDestroy_FFTW;
1328   A->assembled    = PETSC_TRUE;
1329   A->preallocated = PETSC_TRUE;
1330 
1331   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
1332   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
1333   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));
1334 
1335   /* get runtime options */
1336   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1337   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
1338   if (flg) fftw->p_flag = iplans[p_flag];
1339   PetscOptionsEnd();
1340   PetscFunctionReturn(PETSC_SUCCESS);
1341 }
1342