xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision bfe80ac4a46d58cb7760074b25f5e81b2f541d8a)
1 /*
2     Provides an interface to the FFTW package.
3     Testing examples can be found in ~src/mat/tests
4 */
5 
6 #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
7 EXTERN_C_BEGIN
8 #if !PetscDefined(HAVE_MPIUNI)
9   #include <fftw3-mpi.h>
10 #else
11   #include <fftw3.h>
12 #endif
13 EXTERN_C_END
14 
/* Private context for a MATFFTW matrix; hangs off Mat_FFT->data */
typedef struct {
  ptrdiff_t ndim_fftw, *dim_fftw; /* dimensionality and dimension sizes in FFTW's native ptrdiff_t form (used by the fftw_mpi_* interfaces) */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims; /* per-dimension size/stride descriptors for the 64-bit guru planner interface (ndim > 3) */
#else
  fftw_iodim *iodims; /* per-dimension size/stride descriptors for the guru planner interface (ndim > 3) */
#endif
  PetscInt     partial_dim;                                 /* NOTE(review): set in the create routine (not visible here) — presumably a product of trailing dimensions; confirm before relying on it */
  fftw_plan    p_forward, p_backward;                       /* cached plans, created lazily on first MatMult()/MatMultTranspose() */
  unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays because fftw_execute() can only be executed for the arrays with which the plan was created */
} Mat_FFTW;
27 
28 extern PetscErrorCode MatMult_SeqFFTW(Mat, Vec, Vec);
29 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat, Vec, Vec);
30 extern PetscErrorCode MatDestroy_FFTW(Mat);
31 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
32 #if !PetscDefined(HAVE_MPIUNI)
33 extern PetscErrorCode MatMult_MPIFFTW(Mat, Vec, Vec);
34 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat, Vec, Vec);
35 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
36 #endif
37 
/*
   MatMult_SeqFFTW - Applies the forward FFT, y = F*x, for a sequential MATFFTW matrix.

   The FFTW plan is created lazily on the first call and cached in fftw->p_forward.
   With any planner flag other than FFTW_ESTIMATE, plan creation overwrites the
   in/out arrays (FFTW manual sec 2.1/sec 4), so a throwaway duplicate of x is
   used while planning and the real array is fetched afterwards.  Subsequent calls
   reuse the cached plan; if the caller's arrays differ from the ones the plan was
   created with, the new-array execute functions are used instead of fftw_execute().
*/
PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  Vec                xx; /* scratch vector used only while creating a destructive (non-ESTIMATE) plan */
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
  #else
  fftw_iodim *iodims;
  #endif
  PetscInt i;
#endif
  PetscInt ndim = fft->ndim, *dim = fft->dim;

  PetscFunctionBegin;
  if (!fftw->p_forward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    default:
      /* ndim > 3: use the guru interface with row-major strides built from dim[] */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
  #if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        /* innermost (last) dimension is contiguous; outer strides are products of inner sizes */
        iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #else
      if (ndim) {
        iodims[ndim - 1].n  = (int)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #endif

#else
      /* NOTE(review): (int *)dim reinterprets PetscInt* as int*, which misreads the
         sizes when PETSC_USE_64BIT_INDICES is set — confirm this real+64-bit path is excluded upstream */
      fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* remember which arrays the plan was built for so fftw_execute() is valid later */
      fftw->finarray  = (PetscScalar *)x_array;
      fftw->foutarray = y_array;
    }
  }

  if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
    fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
    fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
#endif
  } else {
    fftw_execute(fftw->p_forward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
141 
/*
   MatMultTranspose_SeqFFTW - Applies the backward (inverse, unnormalized) FFT,
   y = F^T*x, for a sequential MATFFTW matrix.

   Mirrors MatMult_SeqFFTW(): the backward plan is created lazily and cached in
   fftw->p_backward, a dummy input vector is used while creating a destructive
   (non-FFTW_ESTIMATE) plan, and the new-array execute functions are used when
   the caller's arrays differ from the ones the plan was created with.
*/
PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  Vec                xx; /* scratch vector used only while creating a destructive (non-ESTIMATE) plan */
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims = fftw->iodims;
  #else
  fftw_iodim *iodims = fftw->iodims;
  #endif
#endif

  PetscFunctionBegin;
  if (!fftw->p_backward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      /* NOTE(review): this guru path reuses fftw->iodims but never fills the n/is/os
         fields itself — it relies on MatMult_SeqFFTW() having run first; confirm the
         strides are initialized if MatMultTranspose can be the first operation */
  #if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #endif
#else
      /* NOTE(review): (int *)dim is only safe when PetscInt is int — see matching note in MatMult_SeqFFTW() */
      fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* remember which arrays the plan was built for so fftw_execute() is valid later */
      fftw->binarray  = (PetscScalar *)x_array;
      fftw->boutarray = y_array;
    }
  }
  if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
    fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
    fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
#endif
  } else {
    fftw_execute(fftw->p_backward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
225 
226 #if !PetscDefined(HAVE_MPIUNI)
/*
   MatMult_MPIFFTW - Applies the forward FFT, y = F*x, for a parallel MATFFTW matrix
   using the fftw_mpi_* planner interface on the matrix's communicator.

   The plan is created lazily and cached in fftw->p_forward; as in the sequential
   case, a dummy input vector shields the caller's data from destructive
   (non-FFTW_ESTIMATE) planning.  Parallel real 1D transforms are unsupported.
*/
PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;
  Vec                xx; /* scratch vector used only while creating a destructive (non-ESTIMATE) plan */

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  if (!fftw->p_forward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
      fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      /* NOTE(review): planner flag hard-coded to FFTW_ESTIMATE instead of fftw->p_flag in the r2c branches — confirm intentional */
      fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    default:
      /* ndim > 3: pass the dimensions in FFTW's native ptrdiff_t form */
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* remember which arrays the plan was built for so fftw_execute() is valid later */
      fftw->finarray  = (PetscScalar *)x_array;
      fftw->foutarray = y_array;
    }
  }
  if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
    /* NOTE(review): unlike the sequential version there is no r2c-specific branch here;
       fftw_execute_dft on an r2c plan is not the documented new-array call — verify the real path */
    fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
  } else {
    fftw_execute(fftw->p_forward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
297 
/*
   MatMultTranspose_MPIFFTW - Applies the backward (inverse, unnormalized) FFT,
   y = F^T*x, for a parallel MATFFTW matrix; mirrors MatMult_MPIFFTW() with the
   backward plan cached in fftw->p_backward and c2r planners for real scalars.
*/
PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;
  Vec                xx; /* scratch vector used only while creating a destructive (non-ESTIMATE) plan */

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  if (!fftw->p_backward && fftw->p_flag != FFTW_ESTIMATE) {
    /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
    PetscCall(VecDuplicate(x, &xx));
    PetscCall(VecGetArrayRead(xx, &x_array));
  } else {
    PetscCall(VecGetArrayRead(x, &x_array));
  }
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
      fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      /* NOTE(review): planner flag hard-coded to FFTW_ESTIMATE instead of fftw->p_flag in the c2r branches — confirm intentional */
      fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    default:
      /* ndim > 3: pass the dimensions in FFTW's native ptrdiff_t form */
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    }
    if (fftw->p_flag != FFTW_ESTIMATE) {
      /* The data in the in/out arrays is overwritten so need a dummy array for computation, see FFTW manual sec2.1 or sec4 */
      PetscCall(VecRestoreArrayRead(xx, &x_array));
      PetscCall(VecDestroy(&xx));
      PetscCall(VecGetArrayRead(x, &x_array));
    } else {
      /* remember which arrays the plan was built for so fftw_execute() is valid later */
      fftw->binarray  = (PetscScalar *)x_array;
      fftw->boutarray = y_array;
    }
  }
  if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
    /* NOTE(review): for real scalars a c2r plan should use fftw_execute_dft_c2r; verify the real path */
    fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
  } else {
    fftw_execute(fftw->p_backward);
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
368 #endif
369 
370 PetscErrorCode MatDestroy_FFTW(Mat A)
371 {
372   Mat_FFT  *fft  = (Mat_FFT *)A->data;
373   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
374 
375   PetscFunctionBegin;
376   fftw_destroy_plan(fftw->p_forward);
377   fftw_destroy_plan(fftw->p_backward);
378   if (fftw->iodims) free(fftw->iodims);
379   PetscCall(PetscFree(fftw->dim_fftw));
380   PetscCall(PetscFree(fft->data));
381   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
382   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
383   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
384 #if !PetscDefined(HAVE_MPIUNI)
385   fftw_mpi_cleanup();
386 #endif
387   PetscFunctionReturn(PETSC_SUCCESS);
388 }
389 
390 #if !PetscDefined(HAVE_MPIUNI)
391   #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
392 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
393 {
394   PetscScalar *array;
395 
396   PetscFunctionBegin;
397   PetscCall(VecGetArray(v, &array));
398   fftw_free((fftw_complex *)array);
399   PetscCall(VecRestoreArray(v, &array));
400   PetscCall(VecDestroy_MPI(v));
401   PetscFunctionReturn(PETSC_SUCCESS);
402 }
403 #endif
404 
405 #if !PetscDefined(HAVE_MPIUNI)
406 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new)
407 {
408   Mat A;
409 
410   PetscFunctionBegin;
411   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
412   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
413   PetscFunctionReturn(PETSC_SUCCESS);
414 }
415 
416 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new)
417 {
418   Mat A;
419 
420   PetscFunctionBegin;
421   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
422   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
423   PetscFunctionReturn(PETSC_SUCCESS);
424 }
425 
426 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
427 {
428   Mat A;
429 
430   PetscFunctionBegin;
431   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
432   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
433   PetscFunctionReturn(PETSC_SUCCESS);
434 }
435 #endif
436 
437 /*@
438   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
439   parallel layout determined by `MATFFTW`
440 
441   Collective
442 
443   Input Parameter:
444 . A - the matrix
445 
446   Output Parameters:
447 + x - (optional) input vector of forward FFTW
448 . y - (optional) output vector of forward FFTW
449 - z - (optional) output vector of backward FFTW
450 
451   Options Database Key:
452 . -mat_fftw_plannerflags - set FFTW planner flags
453 
454   Level: advanced
455 
456   Notes:
  The parallel layout of the output of the forward FFTW is always the same as the input
  of the backward FFTW. But the parallel layout of the input vector of the forward
  FFTW might not be the same as the output of the backward FFTW.
460 
  We need to provide enough space while doing a parallel real transform.
  We need to pad extra zeros at the end of the last dimension. For this reason one needs to
  invoke the fftw_mpi_local_size_transposed routine. Remember that one has to change the
  last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
  depends on whether the last dimension is even or odd. If the last dimension is even, two
  zeros need to be padded; if it is odd, only one zero is needed.
467 
  Lastly, one needs some scratch space at the end of the data set in each process. alloc_local
  figures out how much space is needed, i.e. it figures out the data+scratch space for
  each processor and returns that.
471 
472   Developer Notes:
473   How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?
474 
475 .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
476 @*/
PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "MatCreateVecsFFTW_C"
     (MatCreateVecsFFTW_FFTW); PetscUseMethod errors if A is not a MATFFTW matrix */
  PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
  PetscFunctionReturn(PETSC_SUCCESS);
}
483 
484 PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout)
485 {
486   PetscMPIInt size, rank;
487   MPI_Comm    comm;
488   Mat_FFT    *fft = (Mat_FFT *)A->data;
489 
490   PetscFunctionBegin;
491   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
492   PetscValidType(A, 1);
493   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
494 
495   PetscCallMPI(MPI_Comm_size(comm, &size));
496   PetscCallMPI(MPI_Comm_rank(comm, &rank));
497   if (size == 1) { /* sequential case */
498 #if defined(PETSC_USE_COMPLEX)
499     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
500     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
501     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
502 #else
503     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
504     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
505     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
506 #endif
507 #if !PetscDefined(HAVE_MPIUNI)
508   } else { /* parallel cases */
509     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
510     PetscInt      ndim = fft->ndim, *dim = fft->dim;
511     ptrdiff_t     alloc_local, local_n0, local_0_start;
512     ptrdiff_t     local_n1;
513     fftw_complex *data_fout;
514     ptrdiff_t     local_1_start;
515   #if defined(PETSC_USE_COMPLEX)
516     fftw_complex *data_fin, *data_bout;
517   #else
518     double   *data_finr, *data_boutr;
519     PetscInt  n1, N1;
520     ptrdiff_t temp;
521   #endif
522 
523     switch (ndim) {
524     case 1:
525   #if !defined(PETSC_USE_COMPLEX)
526       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
527   #else
528       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
529       if (fin) {
530         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
531         PetscCall(VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin));
532         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
533         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
534         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
535       }
536       if (fout) {
537         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
538         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout));
539         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
540         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
541         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
542       }
543       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
544       if (bout) {
545         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
546         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout));
547         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
548         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
549         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
550       }
551       break;
552   #endif
553     case 2:
554   #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
555       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
556       N1          = 2 * dim[0] * (dim[1] / 2 + 1);
557       n1          = 2 * local_n0 * (dim[1] / 2 + 1);
558       if (fin) {
559         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
560         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
561         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
562         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
563         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
564       }
565       if (fout) {
566         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
567         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
568         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
569         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
570         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
571       }
572       if (bout) {
573         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
574         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
575         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
576         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
577         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
578       }
579   #else
580       /* Get local size */
581       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
582       if (fin) {
583         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
584         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
585         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
586         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
587         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
588       }
589       if (fout) {
590         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
591         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
592         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
593         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
594         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
595       }
596       if (bout) {
597         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
598         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
599         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
600         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
601         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
602       }
603   #endif
604       break;
605     case 3:
606   #if !defined(PETSC_USE_COMPLEX)
607       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
608       N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
609       n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
610       if (fin) {
611         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
612         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
613         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
614         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
615         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
616       }
617       if (fout) {
618         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
619         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
620         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
621         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
622         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
623       }
624       if (bout) {
625         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
626         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
627         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
628         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
629         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
630       }
631   #else
632       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
633       if (fin) {
634         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
635         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
636         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
637         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
638         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
639       }
640       if (fout) {
641         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
642         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
643         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
644         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
645         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
646       }
647       if (bout) {
648         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
649         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
650         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
651         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
652         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
653       }
654   #endif
655       break;
656     default:
657   #if !defined(PETSC_USE_COMPLEX)
658       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
659       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
660       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
661       N1                                    = 2 * fft->N * (PetscInt)(fftw->dim_fftw[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
662       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
663 
664       if (fin) {
665         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
666         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
667         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
668         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
669         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
670       }
671       if (fout) {
672         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
673         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
674         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
675         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
676         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
677       }
678       if (bout) {
679         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
680         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
681         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
682         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
683         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
684       }
685   #else
686       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
687       if (fin) {
688         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
689         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
690         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
691         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
692         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
693       }
694       if (fout) {
695         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
696         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
697         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
698         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
699         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
700       }
701       if (bout) {
702         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
703         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
704         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
705         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
706         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
707       }
708   #endif
709       break;
710     }
711     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
712        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
       memory leaks. We void these pointers here to avoid accidental use.
714     */
715     if (fin) (*fin)->ops->replacearray = NULL;
716     if (fout) (*fout)->ops->replacearray = NULL;
717     if (bout) (*bout)->ops->replacearray = NULL;
718 #endif
719   }
720   PetscFunctionReturn(PETSC_SUCCESS);
721 }
722 
723 /*@
724   VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.
725 
726   Collective
727 
728   Input Parameters:
729 + A - FFTW matrix
730 - x - the PETSc vector
731 
732   Output Parameter:
733 . y - the FFTW vector
734 
735   Level: intermediate
736 
737   Note:
  For a real parallel FFT, FFTW requires the insertion of extra space at the end of the last dimension. This is required even when
  one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
  zeros. This routine does that job via a scattering operation.
741 
742 .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
743 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under the name "VecScatterPetscToFFTW_C"
     (VecScatterPetscToFFTW_FFTW() for MATFFTW, composed in MatCreate_FFTW()) */
  PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
750 
751 static PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y)
752 {
753   MPI_Comm    comm;
754   Mat_FFT    *fft = (Mat_FFT *)A->data;
755   PetscInt    low;
756   PetscMPIInt rank, size;
757   PetscInt    vsize, vsize1;
758   VecScatter  vecscat;
759   IS          list1;
760 
761   PetscFunctionBegin;
762   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
763   PetscCallMPI(MPI_Comm_size(comm, &size));
764   PetscCallMPI(MPI_Comm_rank(comm, &rank));
765   PetscCall(VecGetOwnershipRange(y, &low, NULL));
766 
767   if (size == 1) {
768     PetscCall(VecGetSize(x, &vsize));
769     PetscCall(VecGetSize(y, &vsize1));
770     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
771     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
772     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
773     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
774     PetscCall(VecScatterDestroy(&vecscat));
775     PetscCall(ISDestroy(&list1));
776 #if !PetscDefined(HAVE_MPIUNI)
777   } else {
778     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
779     PetscInt  ndim = fft->ndim, *dim = fft->dim;
780     ptrdiff_t local_n0, local_0_start;
781     ptrdiff_t local_n1, local_1_start;
782     IS        list2;
783   #if !defined(PETSC_USE_COMPLEX)
784     PetscInt  i, j, k, partial_dim;
785     PetscInt *indx1, *indx2, tempindx, tempindx1;
786     PetscInt  NM;
787     ptrdiff_t temp;
788   #endif
789 
790     switch (ndim) {
791     case 1:
792   #if defined(PETSC_USE_COMPLEX)
793       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
794 
795       PetscCall(ISCreateStride(comm, local_n0, local_0_start, 1, &list1));
796       PetscCall(ISCreateStride(comm, local_n0, low, 1, &list2));
797       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
798       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
799       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
800       PetscCall(VecScatterDestroy(&vecscat));
801       PetscCall(ISDestroy(&list1));
802       PetscCall(ISDestroy(&list2));
803   #else
804       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
805   #endif
806       break;
807     case 2:
808   #if defined(PETSC_USE_COMPLEX)
809       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
810 
811       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
812       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
813       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
814       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
815       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
816       PetscCall(VecScatterDestroy(&vecscat));
817       PetscCall(ISDestroy(&list1));
818       PetscCall(ISDestroy(&list2));
819   #else
820       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
821 
822       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx1));
823       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx2));
824 
825       if (dim[1] % 2 == 0) {
826         NM = dim[1] + 2;
827       } else {
828         NM = dim[1] + 1;
829       }
830       for (i = 0; i < local_n0; i++) {
831         for (j = 0; j < dim[1]; j++) {
832           tempindx  = i * dim[1] + j;
833           tempindx1 = i * NM + j;
834 
835           indx1[tempindx] = local_0_start * dim[1] + tempindx;
836           indx2[tempindx] = low + tempindx1;
837         }
838       }
839 
840       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
841       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
842 
843       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
844       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
845       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
846       PetscCall(VecScatterDestroy(&vecscat));
847       PetscCall(ISDestroy(&list1));
848       PetscCall(ISDestroy(&list2));
849       PetscCall(PetscFree(indx1));
850       PetscCall(PetscFree(indx2));
851   #endif
852       break;
853 
854     case 3:
855   #if defined(PETSC_USE_COMPLEX)
856       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
857 
858       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
859       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
860       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
861       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
862       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
863       PetscCall(VecScatterDestroy(&vecscat));
864       PetscCall(ISDestroy(&list1));
865       PetscCall(ISDestroy(&list2));
866   #else
867       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
868       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
869       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
870 
871       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx1));
872       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx2));
873 
874       if (dim[2] % 2 == 0) NM = dim[2] + 2;
875       else NM = dim[2] + 1;
876 
877       for (i = 0; i < local_n0; i++) {
878         for (j = 0; j < dim[1]; j++) {
879           for (k = 0; k < dim[2]; k++) {
880             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
881             tempindx1 = i * dim[1] * NM + j * NM + k;
882 
883             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
884             indx2[tempindx] = low + tempindx1;
885           }
886         }
887       }
888 
889       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
890       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
891       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
892       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
893       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
894       PetscCall(VecScatterDestroy(&vecscat));
895       PetscCall(ISDestroy(&list1));
896       PetscCall(ISDestroy(&list2));
897       PetscCall(PetscFree(indx1));
898       PetscCall(PetscFree(indx2));
899   #endif
900       break;
901 
902     default:
903   #if defined(PETSC_USE_COMPLEX)
904       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
905 
906       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
907       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
908       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
909       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
910       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
911       PetscCall(VecScatterDestroy(&vecscat));
912       PetscCall(ISDestroy(&list1));
913       PetscCall(ISDestroy(&list2));
914   #else
915       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
916       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
917       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
918 
919       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
920 
921       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
922 
923       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
924 
925       partial_dim = fftw->partial_dim;
926 
927       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx1));
928       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx2));
929 
930       if (dim[ndim - 1] % 2 == 0) NM = 2;
931       else NM = 1;
932 
933       j = low;
934       for (i = 0, k = 1; i < (PetscInt)local_n0 * partial_dim; i++, k++) {
935         indx1[i] = local_0_start * partial_dim + i;
936         indx2[i] = j;
937         if (k % dim[ndim - 1] == 0) j += NM;
938         j++;
939       }
940       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
941       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
942       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
943       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
944       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
945       PetscCall(VecScatterDestroy(&vecscat));
946       PetscCall(ISDestroy(&list1));
947       PetscCall(ISDestroy(&list2));
948       PetscCall(PetscFree(indx1));
949       PetscCall(PetscFree(indx2));
950   #endif
951       break;
952     }
953 #endif
954   }
955   PetscFunctionReturn(PETSC_SUCCESS);
956 }
957 
958 /*@
959   VecScatterFFTWToPetsc - Converts `MATFFTW` output vector to a PETSc vector.
960 
961   Collective
962 
963   Input Parameters:
964 + A - `MATFFTW` matrix
965 - x - FFTW vector
966 
967   Output Parameter:
968 . y - PETSc vector
969 
970   Level: intermediate
971 
972   Note:
  When doing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
  `VecScatterFFTWToPetsc()` removes those extra zeros.
975 
976 .seealso: [](ch_matrices), `Mat`, `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
977 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under the name "VecScatterFFTWToPetsc_C"
     (VecScatterFFTWToPetsc_FFTW() for MATFFTW, composed in MatCreate_FFTW()) */
  PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
984 
985 static PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y)
986 {
987   MPI_Comm    comm;
988   Mat_FFT    *fft = (Mat_FFT *)A->data;
989   PetscInt    low;
990   PetscMPIInt rank, size;
991   VecScatter  vecscat;
992   IS          list1;
993 
994   PetscFunctionBegin;
995   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
996   PetscCallMPI(MPI_Comm_size(comm, &size));
997   PetscCallMPI(MPI_Comm_rank(comm, &rank));
998   PetscCall(VecGetOwnershipRange(x, &low, NULL));
999 
1000   if (size == 1) {
1001     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
1002     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
1003     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1004     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1005     PetscCall(VecScatterDestroy(&vecscat));
1006     PetscCall(ISDestroy(&list1));
1007 
1008 #if !PetscDefined(HAVE_MPIUNI)
1009   } else {
1010     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
1011     PetscInt  ndim = fft->ndim, *dim = fft->dim;
1012     ptrdiff_t local_n0, local_0_start;
1013     ptrdiff_t local_n1, local_1_start;
1014     IS        list2;
1015   #if !defined(PETSC_USE_COMPLEX)
1016     PetscInt  i, j, k, partial_dim;
1017     PetscInt *indx1, *indx2, tempindx, tempindx1;
1018     PetscInt  NM;
1019     ptrdiff_t temp;
1020   #endif
1021     switch (ndim) {
1022     case 1:
1023   #if defined(PETSC_USE_COMPLEX)
1024       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1025 
1026       PetscCall(ISCreateStride(comm, local_n1, local_1_start, 1, &list1));
1027       PetscCall(ISCreateStride(comm, local_n1, low, 1, &list2));
1028       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1029       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1030       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1031       PetscCall(VecScatterDestroy(&vecscat));
1032       PetscCall(ISDestroy(&list1));
1033       PetscCall(ISDestroy(&list2));
1034   #else
1035       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
1036   #endif
1037       break;
1038     case 2:
1039   #if defined(PETSC_USE_COMPLEX)
1040       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1041 
1042       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
1043       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
1044       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1045       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1046       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1047       PetscCall(VecScatterDestroy(&vecscat));
1048       PetscCall(ISDestroy(&list1));
1049       PetscCall(ISDestroy(&list2));
1050   #else
1051       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1052 
1053       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx1));
1054       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1], &indx2));
1055 
1056       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1057       else NM = dim[1] + 1;
1058 
1059       for (i = 0; i < local_n0; i++) {
1060         for (j = 0; j < dim[1]; j++) {
1061           tempindx  = i * dim[1] + j;
1062           tempindx1 = i * NM + j;
1063 
1064           indx1[tempindx] = local_0_start * dim[1] + tempindx;
1065           indx2[tempindx] = low + tempindx1;
1066         }
1067       }
1068 
1069       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
1070       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
1071 
1072       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1073       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1074       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1075       PetscCall(VecScatterDestroy(&vecscat));
1076       PetscCall(ISDestroy(&list1));
1077       PetscCall(ISDestroy(&list2));
1078       PetscCall(PetscFree(indx1));
1079       PetscCall(PetscFree(indx2));
1080   #endif
1081       break;
1082     case 3:
1083   #if defined(PETSC_USE_COMPLEX)
1084       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1085 
1086       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
1087       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
1088       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1089       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1090       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1091       PetscCall(VecScatterDestroy(&vecscat));
1092       PetscCall(ISDestroy(&list1));
1093       PetscCall(ISDestroy(&list2));
1094   #else
1095       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1096 
1097       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx1));
1098       PetscCall(PetscMalloc1((PetscInt)local_n0 * dim[1] * dim[2], &indx2));
1099 
1100       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1101       else NM = dim[2] + 1;
1102 
1103       for (i = 0; i < local_n0; i++) {
1104         for (j = 0; j < dim[1]; j++) {
1105           for (k = 0; k < dim[2]; k++) {
1106             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1107             tempindx1 = i * dim[1] * NM + j * NM + k;
1108 
1109             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
1110             indx2[tempindx] = low + tempindx1;
1111           }
1112         }
1113       }
1114 
1115       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
1116       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
1117 
1118       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1119       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1120       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1121       PetscCall(VecScatterDestroy(&vecscat));
1122       PetscCall(ISDestroy(&list1));
1123       PetscCall(ISDestroy(&list2));
1124       PetscCall(PetscFree(indx1));
1125       PetscCall(PetscFree(indx2));
1126   #endif
1127       break;
1128     default:
1129   #if defined(PETSC_USE_COMPLEX)
1130       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
1131 
1132       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
1133       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
1134       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1135       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1136       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1137       PetscCall(VecScatterDestroy(&vecscat));
1138       PetscCall(ISDestroy(&list1));
1139       PetscCall(ISDestroy(&list2));
1140   #else
1141       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
1142 
1143       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
1144 
1145       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1146 
1147       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
1148 
1149       partial_dim = fftw->partial_dim;
1150 
1151       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx1));
1152       PetscCall(PetscMalloc1((PetscInt)local_n0 * partial_dim, &indx2));
1153 
1154       if (dim[ndim - 1] % 2 == 0) NM = 2;
1155       else NM = 1;
1156 
1157       j = low;
1158       for (i = 0, k = 1; i < (PetscInt)local_n0 * partial_dim; i++, k++) {
1159         indx1[i] = local_0_start * partial_dim + i;
1160         indx2[i] = j;
1161         if (k % dim[ndim - 1] == 0) j += NM;
1162         j++;
1163       }
1164       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
1165       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
1166 
1167       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1168       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1169       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1170       PetscCall(VecScatterDestroy(&vecscat));
1171       PetscCall(ISDestroy(&list1));
1172       PetscCall(ISDestroy(&list2));
1173       PetscCall(PetscFree(indx1));
1174       PetscCall(PetscFree(indx2));
1175   #endif
1176       break;
1177     }
1178 #endif
1179   }
1180   PetscFunctionReturn(PETSC_SUCCESS);
1181 }
1182 
1183 /*MC
1184   MATFFTW -  "fftw" - Matrix type that provides FFT via the FFTW external package.
1185 
1186   Level: intermediate
1187 
1188 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatType`, `MatCreateFFT()`
1189 M*/
1190 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1191 {
1192   MPI_Comm    comm;
1193   Mat_FFT    *fft = (Mat_FFT *)A->data;
1194   Mat_FFTW   *fftw;
1195   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1196   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1197   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1198   PetscBool   flg;
1199   PetscInt    p_flag, partial_dim = 1, ctr;
1200   PetscMPIInt size, rank;
1201   ptrdiff_t  *pdim;
1202 #if !defined(PETSC_USE_COMPLEX)
1203   PetscInt tot_dim;
1204 #endif
1205 
1206   PetscFunctionBegin;
1207   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1208   PetscCallMPI(MPI_Comm_size(comm, &size));
1209   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1210 
1211 #if !PetscDefined(HAVE_MPIUNI)
1212   fftw_mpi_init();
1213 #endif
1214   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1215   pdim[0] = dim[0];
1216 #if !defined(PETSC_USE_COMPLEX)
1217   tot_dim = 2 * dim[0];
1218 #endif
1219   for (ctr = 1; ctr < ndim; ctr++) {
1220     partial_dim *= dim[ctr];
1221     pdim[ctr] = dim[ctr];
1222 #if !defined(PETSC_USE_COMPLEX)
1223     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1224     else tot_dim *= dim[ctr];
1225 #endif
1226   }
1227 
1228   if (size == 1) {
1229 #if defined(PETSC_USE_COMPLEX)
1230     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
1231     fft->n = fft->N;
1232 #else
1233     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
1234     fft->n = tot_dim;
1235 #endif
1236 #if !PetscDefined(HAVE_MPIUNI)
1237   } else {
1238     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1239   #if !defined(PETSC_USE_COMPLEX)
1240     ptrdiff_t temp;
1241     PetscInt  N1;
1242   #endif
1243 
1244     switch (ndim) {
1245     case 1:
1246   #if !defined(PETSC_USE_COMPLEX)
1247       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1248   #else
1249       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1250       fft->n = (PetscInt)local_n0;
1251       PetscCall(MatSetSizes(A, local_n1, fft->n, fft->N, fft->N));
1252   #endif
1253       break;
1254     case 2:
1255   #if defined(PETSC_USE_COMPLEX)
1256       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1257       fft->n = (PetscInt)local_n0 * dim[1];
1258       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1259   #else
1260       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1261 
1262       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1263       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
1264   #endif
1265       break;
1266     case 3:
1267   #if defined(PETSC_USE_COMPLEX)
1268       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1269 
1270       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1271       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1272   #else
1273       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1274 
1275       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1276       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
1277   #endif
1278       break;
1279     default:
1280   #if defined(PETSC_USE_COMPLEX)
1281       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);
1282 
1283       fft->n = (PetscInt)local_n0 * partial_dim;
1284       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1285   #else
1286       temp = pdim[ndim - 1];
1287 
1288       pdim[ndim - 1] = temp / 2 + 1;
1289 
1290       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1291 
1292       fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
1293       N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);
1294 
1295       pdim[ndim - 1] = temp;
1296 
1297       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1298   #endif
1299       break;
1300     }
1301 #endif
1302   }
1303   free(pdim);
1304   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
1305   PetscCall(PetscNew(&fftw));
1306   fft->data = (void *)fftw;
1307 
1308   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1309   fftw->partial_dim = partial_dim;
1310 
1311   PetscCall(PetscMalloc1(ndim, &fftw->dim_fftw));
1312   if (size == 1) {
1313 #if defined(PETSC_USE_64BIT_INDICES)
1314     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1315 #else
1316     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1317 #endif
1318   }
1319 
1320   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];
1321 
1322   fftw->p_forward  = NULL;
1323   fftw->p_backward = NULL;
1324   fftw->p_flag     = FFTW_ESTIMATE;
1325 
1326   if (size == 1) {
1327     A->ops->mult          = MatMult_SeqFFTW;
1328     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1329 #if !PetscDefined(HAVE_MPIUNI)
1330   } else {
1331     A->ops->mult          = MatMult_MPIFFTW;
1332     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1333 #endif
1334   }
1335   fft->matdestroy = MatDestroy_FFTW;
1336   A->assembled    = PETSC_TRUE;
1337   A->preallocated = PETSC_TRUE;
1338 
1339   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
1340   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
1341   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));
1342 
1343   /* get runtime options */
1344   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1345   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
1346   if (flg) fftw->p_flag = iplans[p_flag];
1347   PetscOptionsEnd();
1348   PetscFunctionReturn(PETSC_SUCCESS);
1349 }
1350