xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 4d86920da9ee67c835173a5dfffa1b3a52fd24ca)
1 /*
2     Provides an interface to the FFTW package.
3     Testing examples can be found in ~src/mat/tests
4 */
5 
6 #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
7 EXTERN_C_BEGIN
8 #if !PetscDefined(HAVE_MPIUNI)
9   #include <fftw3-mpi.h>
10 #else
11   #include <fftw3.h>
12 #endif
13 EXTERN_C_END
14 
/* Private FFTW-specific data hanging off Mat_FFT->data */
typedef struct {
  ptrdiff_t ndim_fftw, *dim_fftw; /* number of dimensions and their sizes, in FFTW's native ptrdiff_t form */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims; /* dimension descriptors for FFTW's 64-bit guru planner interface (ndim > 3) */
#else
  fftw_iodim *iodims; /* dimension descriptors for FFTW's guru planner interface (ndim > 3) */
#endif
  PetscInt     partial_dim; /* NOTE(review): not referenced in this chunk; presumably a partial product of the dimensions used for parallel layouts — confirm against the rest of the file */
  fftw_plan    p_forward, p_backward; /* cached plans, created lazily on first MatMult()/MatMultTranspose() */
  unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays because fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
28 
29 extern PetscErrorCode MatMult_SeqFFTW(Mat, Vec, Vec);
30 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat, Vec, Vec);
31 extern PetscErrorCode MatDestroy_FFTW(Mat);
32 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
33 #if !PetscDefined(HAVE_MPIUNI)
34 extern PetscErrorCode MatMult_MPIFFTW(Mat, Vec, Vec);
35 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat, Vec, Vec);
36 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
37 #endif
38 
/*
   MatMult_SeqFFTW - Sequential forward FFT, y = FFT(x).

   The forward plan is created lazily on the first call: dimension-specific
   planners are used for ndim <= 3 and FFTW's guru interface otherwise.
   The arrays the plan was created with are cached (finarray/foutarray) so
   that later calls with different arrays go through FFTW's new-array
   execute functions, as the FFTW manual requires.
*/
PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
  #else
  fftw_iodim *iodims;
  #endif
  PetscInt i;
#endif
  PetscInt ndim = fft->ndim, *dim = fft->dim;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
  #if defined(PETSC_USE_64BIT_INDICES)
      /* Build row-major strides: innermost (last) dimension has stride 1,
         each earlier dimension's stride is the product of the later sizes. */
      if (ndim) {
        iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #else
      /* Same stride construction as above, but with int-sized fields */
      if (ndim) {
        iodims[ndim - 1].n  = (int)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #endif

#else
      /* NOTE(review): (int *)dim reinterprets a PetscInt array as int; this is
         only safe when PetscInt is int — with PETSC_USE_64BIT_INDICES the
         planner would read wrong sizes. Confirm this path cannot be reached
         in a 64-bit-index real-scalar build. */
      fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    }
    fftw->finarray  = (PetscScalar *)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else {                                                         /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
131 
/*
   MatMultTranspose_SeqFFTW - Sequential backward (inverse, unnormalized) FFT,
   y = IFFT(x).

   Mirrors MatMult_SeqFFTW(): the backward plan is created lazily and the
   creation-time arrays are cached (binarray/boutarray) so later calls with
   different arrays use FFTW's new-array execute functions.

   NOTE(review): the guru path (ndim > 3, complex build) reuses fftw->iodims
   without filling it in here — it appears to rely on a prior forward
   MatMult_SeqFFTW() (or matrix setup elsewhere) having populated the
   descriptors. Confirm a backward transform cannot be the first operation.

   NOTE(review): per the FFTW manual, multi-dimensional c2r transforms may
   overwrite their input array even when executed; callers relying on x being
   preserved in the real-scalar build should verify this.
*/
PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims = fftw->iodims;
  #else
  fftw_iodim *iodims = fftw->iodims;
  #endif
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #endif
#else
      /* NOTE(review): (int *)dim is only valid when PetscInt is int; see the
         matching note in MatMult_SeqFFTW(). */
      fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    }
    fftw->binarray  = (PetscScalar *)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else {                                                         /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
203 
204 #if !PetscDefined(HAVE_MPIUNI)
/*
   MatMult_MPIFFTW - Parallel (MPI) forward FFT, y = FFT(x), over the
   matrix's communicator.

   The forward plan is created lazily on first use via the fftw_mpi planner
   family and the creation-time arrays are cached, as in the sequential case.

   NOTE(review): the real-scalar (r2c) branches pass FFTW_ESTIMATE to the
   planner instead of fftw->p_flag — inconsistent with the complex branches;
   confirm whether this is intentional (e.g. to avoid clobbering input during
   measurement) or an oversight.

   NOTE(review): the new-array path below calls fftw_execute_dft() on plans
   created by the MPI planners (and, in real builds, on r2c plans). The FFTW
   manual directs MPI plans to fftw_mpi_execute_dft()/fftw_mpi_execute_dft_r2c()
   — confirm against the FFTW MPI documentation.
*/
PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
      fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    default:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    }
    fftw->finarray  = (PetscScalar *)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else {                                                         /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
      fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
265 
/*
   MatMultTranspose_MPIFFTW - Parallel (MPI) backward (inverse, unnormalized)
   FFT, y = IFFT(x), over the matrix's communicator.

   Mirrors MatMult_MPIFFTW(): lazy plan creation with cached creation-time
   arrays. The same review notes apply: the real-scalar (c2r) branches pass
   FFTW_ESTIMATE instead of fftw->p_flag, and the new-array path calls
   fftw_execute_dft() on MPI (and, in real builds, c2r) plans where the FFTW
   manual prescribes the fftw_mpi_execute_* functions — confirm.
*/
PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
      fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    default:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    }
    fftw->binarray  = (PetscScalar *)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else {                                                         /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
      fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
324 #endif
325 
326 PetscErrorCode MatDestroy_FFTW(Mat A)
327 {
328   Mat_FFT  *fft  = (Mat_FFT *)A->data;
329   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
330 
331   PetscFunctionBegin;
332   fftw_destroy_plan(fftw->p_forward);
333   fftw_destroy_plan(fftw->p_backward);
334   if (fftw->iodims) free(fftw->iodims);
335   PetscCall(PetscFree(fftw->dim_fftw));
336   PetscCall(PetscFree(fft->data));
337   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
338   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
339   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
340 #if !PetscDefined(HAVE_MPIUNI)
341   fftw_mpi_cleanup();
342 #endif
343   PetscFunctionReturn(PETSC_SUCCESS);
344 }
345 
346 #if !PetscDefined(HAVE_MPIUNI)
347   #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
348 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
349 {
350   PetscScalar *array;
351 
352   PetscFunctionBegin;
353   PetscCall(VecGetArray(v, &array));
354   fftw_free((fftw_complex *)array);
355   PetscCall(VecRestoreArray(v, &array));
356   PetscCall(VecDestroy_MPI(v));
357   PetscFunctionReturn(PETSC_SUCCESS);
358 }
359 #endif
360 
361 #if !PetscDefined(HAVE_MPIUNI)
362 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new)
363 {
364   Mat A;
365 
366   PetscFunctionBegin;
367   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
368   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
369   PetscFunctionReturn(PETSC_SUCCESS);
370 }
371 
372 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new)
373 {
374   Mat A;
375 
376   PetscFunctionBegin;
377   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
378   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
379   PetscFunctionReturn(PETSC_SUCCESS);
380 }
381 
382 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
383 {
384   Mat A;
385 
386   PetscFunctionBegin;
387   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
388   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
389   PetscFunctionReturn(PETSC_SUCCESS);
390 }
391 #endif
392 
/*@
  MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
  parallel layout determined by `MATFFTW`

  Collective

  Input Parameter:
. A - the matrix

  Output Parameters:
+ x - (optional) input vector of forward FFTW
. y - (optional) output vector of forward FFTW
- z - (optional) output vector of backward FFTW

  Options Database Key:
. -mat_fftw_plannerflags - set FFTW planner flags

  Level: advanced

  Notes:
  The parallel layout of output of forward FFTW is always same as the input
  of the backward FFTW. But parallel layout of the input vector of forward
  FFTW might not be same as the output of backward FFTW.

  We need to provide enough space while doing parallel real transform.
  We need to pad extra zeros at the end of last dimension. For this reason one needs to
  invoke the fftw_mpi_local_size_transposed routine. Remember one has to change the
  last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
  depends on if the last dimension is even or odd. If the last dimension is even need to pad two
  zeros if it is odd only one zero is needed.

  Lastly one needs some scratch space at the end of data set in each process. alloc_local
  figures out how much space is needed, i.e. it figures out the data+scratch space for
  each processor and returns that.

  Developer Notes:
  How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?

.seealso: [](ch_matrices), `Mat`, `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
@*/
PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z)
{
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation composed on the matrix */
  PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
  PetscFunctionReturn(PETSC_SUCCESS);
}
439 
/*
   MatCreateVecsFFTW_FFTW - Type-specific implementation of MatCreateVecsFFTW().

   Sequential case: plain PETSc Seq vectors of the appropriate length.
   Parallel case: each vector's data buffer is allocated with fftw_malloc()
   using the sizes reported by the fftw_mpi_local_size_* family (which include
   FFTW's scratch space), wrapped with VecCreateMPIWithArray(), and given
   custom duplicate/destroy ops so copies keep the FFTW layout and the buffer
   is returned via fftw_free().

   NOTE(review): the fftw_malloc() return values are not checked for NULL
   anywhere below; an out-of-memory condition would be caught only later (if
   at all) — consider adding PetscCheck()s.
*/
PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout)
{
  PetscMPIInt size, rank;
  MPI_Comm    comm;
  Mat_FFT    *fft = (Mat_FFT *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidType(A, 1);
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));

  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));
  if (size == 1) { /* sequential case */
#if defined(PETSC_USE_COMPLEX)
    /* complex: all three vectors have the full transform length N */
    if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
    if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
    if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
#else
    /* real: use the (padded) local length n computed at matrix setup */
    if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
    if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
    if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
#endif
#if !PetscDefined(HAVE_MPIUNI)
  } else { /* parallel cases */
    Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
    PetscInt      ndim = fft->ndim, *dim = fft->dim;
    ptrdiff_t     alloc_local, local_n0, local_0_start;
    ptrdiff_t     local_n1;
    fftw_complex *data_fout;
    ptrdiff_t     local_1_start;
  #if defined(PETSC_USE_COMPLEX)
    fftw_complex *data_fin, *data_bout;
  #else
    double   *data_finr, *data_boutr;
    PetscInt  n1, N1;
    ptrdiff_t temp;
  #endif

    switch (ndim) {
    case 1:
  #if !defined(PETSC_USE_COMPLEX)
      SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
  #else
      /* 1D complex: forward and backward may have different local layouts,
         so query the local sizes separately for each direction */
      alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
      if (fin) {
        data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin));
        PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout));
        PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
      if (bout) {
        data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout));
        PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
        /* NOTE(review): this sets the backward-output vector's duplicate op to
           VecDuplicate_FFTW_fout (forward layout) instead of
           VecDuplicate_FFTW_bout; for 1D the two layouts are queried
           separately above and may differ — confirm whether this is a bug. */
        (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      break;
  #endif
    case 2:
  #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
      /* 2D real: pad the last dimension to 2*(dim[1]/2+1) reals so the r2c
         output fits in place; N1/n1 are the padded global/local lengths */
      alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
      N1          = 2 * dim[0] * (dim[1] / 2 + 1);
      n1          = 2 * local_n0 * (dim[1] / 2 + 1);
      if (fin) {
        data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
        PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
        PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
        PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
        PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
        PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
  #else
      /* Get local size */
      alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
      if (fin) {
        data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
        PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
        PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
        PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
  #endif
      break;
    case 3:
  #if !defined(PETSC_USE_COMPLEX)
      /* 3D real: same padding scheme as 2D, applied to the last dimension */
      alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
      N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
      n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
      if (fin) {
        data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
        PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
        PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
        PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
        PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
        PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
  #else
      alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
      if (fin) {
        data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
        PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
        PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
        PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
  #endif
      break;
    default:
  #if !defined(PETSC_USE_COMPLEX)
      /* General-dimension real: temporarily replace the last dimension with
         n/2+1 to query the transposed local size, then restore it; N1 is the
         padded global length derived from the ratio of padded to true size */
      temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
      (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
      alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
      N1                                    = 2 * fft->N * (PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
      (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;

      if (fin) {
        data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
        PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
        PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
        PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
  #else
      alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
      if (fin) {
        data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
        PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
        (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
        (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (fout) {
        data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
        PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
        (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
        (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
      if (bout) {
        data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
        PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
        PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
        (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
        (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
      }
  #endif
      break;
    }
    /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
       v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
       memory leaks. We void these pointers here to avoid accident uses.
    */
    if (fin) (*fin)->ops->replacearray = NULL;
    if (fout) (*fout)->ops->replacearray = NULL;
    if (bout) (*bout)->ops->replacearray = NULL;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
678 
679 /*@
680   VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.
681 
682   Collective
683 
684   Input Parameters:
685 + A - FFTW matrix
686 - x - the PETSc vector
687 
688   Output Parameter:
689 . y - the FFTW vector
690 
691   Level: intermediate
692 
693   Note:
  For real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
  one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
  zeros. This routine does that job by a scatter operation.
697 
698 .seealso: [](ch_matrices), `Mat`, `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
699 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under the key "VecScatterPetscToFFTW_C"
     (installed as VecScatterPetscToFFTW_FFTW in MatCreate_FFTW); PetscUseMethod errors
     if A carries no such method, i.e. if A is not a MATFFTW matrix. */
  PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
706 
707 static PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y)
708 {
709   MPI_Comm    comm;
710   Mat_FFT    *fft = (Mat_FFT *)A->data;
711   PetscInt    low;
712   PetscMPIInt rank, size;
713   PetscInt    vsize, vsize1;
714   VecScatter  vecscat;
715   IS          list1;
716 
717   PetscFunctionBegin;
718   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
719   PetscCallMPI(MPI_Comm_size(comm, &size));
720   PetscCallMPI(MPI_Comm_rank(comm, &rank));
721   PetscCall(VecGetOwnershipRange(y, &low, NULL));
722 
723   if (size == 1) {
724     PetscCall(VecGetSize(x, &vsize));
725     PetscCall(VecGetSize(y, &vsize1));
726     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
727     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
728     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
729     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
730     PetscCall(VecScatterDestroy(&vecscat));
731     PetscCall(ISDestroy(&list1));
732 #if !PetscDefined(HAVE_MPIUNI)
733   } else {
734     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
735     PetscInt  ndim = fft->ndim, *dim = fft->dim;
736     ptrdiff_t local_n0, local_0_start;
737     ptrdiff_t local_n1, local_1_start;
738     IS        list2;
739   #if !defined(PETSC_USE_COMPLEX)
740     PetscInt  i, j, k, partial_dim;
741     PetscInt *indx1, *indx2, tempindx, tempindx1;
742     PetscInt  NM;
743     ptrdiff_t temp;
744   #endif
745 
746     switch (ndim) {
747     case 1:
748   #if defined(PETSC_USE_COMPLEX)
749       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
750 
751       PetscCall(ISCreateStride(comm, local_n0, local_0_start, 1, &list1));
752       PetscCall(ISCreateStride(comm, local_n0, low, 1, &list2));
753       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
754       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
755       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
756       PetscCall(VecScatterDestroy(&vecscat));
757       PetscCall(ISDestroy(&list1));
758       PetscCall(ISDestroy(&list2));
759   #else
760       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
761   #endif
762       break;
763     case 2:
764   #if defined(PETSC_USE_COMPLEX)
765       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
766 
767       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
768       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
769       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
770       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
771       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
772       PetscCall(VecScatterDestroy(&vecscat));
773       PetscCall(ISDestroy(&list1));
774       PetscCall(ISDestroy(&list2));
775   #else
776       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
777 
778       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
779       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
780 
781       if (dim[1] % 2 == 0) {
782         NM = dim[1] + 2;
783       } else {
784         NM = dim[1] + 1;
785       }
786       for (i = 0; i < local_n0; i++) {
787         for (j = 0; j < dim[1]; j++) {
788           tempindx  = i * dim[1] + j;
789           tempindx1 = i * NM + j;
790 
791           indx1[tempindx] = local_0_start * dim[1] + tempindx;
792           indx2[tempindx] = low + tempindx1;
793         }
794       }
795 
796       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
797       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
798 
799       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
800       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
801       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
802       PetscCall(VecScatterDestroy(&vecscat));
803       PetscCall(ISDestroy(&list1));
804       PetscCall(ISDestroy(&list2));
805       PetscCall(PetscFree(indx1));
806       PetscCall(PetscFree(indx2));
807   #endif
808       break;
809 
810     case 3:
811   #if defined(PETSC_USE_COMPLEX)
812       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
813 
814       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
815       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
816       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
817       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
818       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
819       PetscCall(VecScatterDestroy(&vecscat));
820       PetscCall(ISDestroy(&list1));
821       PetscCall(ISDestroy(&list2));
822   #else
823       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
824       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
825       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
826 
827       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
828       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
829 
830       if (dim[2] % 2 == 0) NM = dim[2] + 2;
831       else NM = dim[2] + 1;
832 
833       for (i = 0; i < local_n0; i++) {
834         for (j = 0; j < dim[1]; j++) {
835           for (k = 0; k < dim[2]; k++) {
836             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
837             tempindx1 = i * dim[1] * NM + j * NM + k;
838 
839             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
840             indx2[tempindx] = low + tempindx1;
841           }
842         }
843       }
844 
845       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
846       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
847       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
848       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
849       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
850       PetscCall(VecScatterDestroy(&vecscat));
851       PetscCall(ISDestroy(&list1));
852       PetscCall(ISDestroy(&list2));
853       PetscCall(PetscFree(indx1));
854       PetscCall(PetscFree(indx2));
855   #endif
856       break;
857 
858     default:
859   #if defined(PETSC_USE_COMPLEX)
860       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
861 
862       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
863       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
864       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
865       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
866       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
867       PetscCall(VecScatterDestroy(&vecscat));
868       PetscCall(ISDestroy(&list1));
869       PetscCall(ISDestroy(&list2));
870   #else
871       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
872       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
873       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
874 
875       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
876 
877       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
878 
879       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
880 
881       partial_dim = fftw->partial_dim;
882 
883       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
884       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
885 
886       if (dim[ndim - 1] % 2 == 0) NM = 2;
887       else NM = 1;
888 
889       j = low;
890       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
891         indx1[i] = local_0_start * partial_dim + i;
892         indx2[i] = j;
893         if (k % dim[ndim - 1] == 0) j += NM;
894         j++;
895       }
896       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
897       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
898       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
899       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
900       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
901       PetscCall(VecScatterDestroy(&vecscat));
902       PetscCall(ISDestroy(&list1));
903       PetscCall(ISDestroy(&list2));
904       PetscCall(PetscFree(indx1));
905       PetscCall(PetscFree(indx2));
906   #endif
907       break;
908     }
909 #endif
910   }
911   PetscFunctionReturn(PETSC_SUCCESS);
912 }
913 
914 /*@
915   VecScatterFFTWToPetsc - Converts `MATFFTW` output vector to a PETSc vector.
916 
917   Collective
918 
919   Input Parameters:
920 + A - `MATFFTW` matrix
921 - x - FFTW vector
922 
923   Output Parameter:
924 . y - PETSc vector
925 
926   Level: intermediate
927 
928   Note:
  While doing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
  `VecScatterFFTWToPetsc()` removes those extra zeros.
931 
932 .seealso: [](ch_matrices), `Mat`, `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
933 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under the key "VecScatterFFTWToPetsc_C"
     (installed as VecScatterFFTWToPetsc_FFTW in MatCreate_FFTW); PetscUseMethod errors
     if A carries no such method, i.e. if A is not a MATFFTW matrix. */
  PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
940 
941 static PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y)
942 {
943   MPI_Comm    comm;
944   Mat_FFT    *fft = (Mat_FFT *)A->data;
945   PetscInt    low;
946   PetscMPIInt rank, size;
947   VecScatter  vecscat;
948   IS          list1;
949 
950   PetscFunctionBegin;
951   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
952   PetscCallMPI(MPI_Comm_size(comm, &size));
953   PetscCallMPI(MPI_Comm_rank(comm, &rank));
954   PetscCall(VecGetOwnershipRange(x, &low, NULL));
955 
956   if (size == 1) {
957     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
958     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
959     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
960     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
961     PetscCall(VecScatterDestroy(&vecscat));
962     PetscCall(ISDestroy(&list1));
963 
964 #if !PetscDefined(HAVE_MPIUNI)
965   } else {
966     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
967     PetscInt  ndim = fft->ndim, *dim = fft->dim;
968     ptrdiff_t local_n0, local_0_start;
969     ptrdiff_t local_n1, local_1_start;
970     IS        list2;
971   #if !defined(PETSC_USE_COMPLEX)
972     PetscInt  i, j, k, partial_dim;
973     PetscInt *indx1, *indx2, tempindx, tempindx1;
974     PetscInt  NM;
975     ptrdiff_t temp;
976   #endif
977     switch (ndim) {
978     case 1:
979   #if defined(PETSC_USE_COMPLEX)
980       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
981 
982       PetscCall(ISCreateStride(comm, local_n1, local_1_start, 1, &list1));
983       PetscCall(ISCreateStride(comm, local_n1, low, 1, &list2));
984       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
985       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
986       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
987       PetscCall(VecScatterDestroy(&vecscat));
988       PetscCall(ISDestroy(&list1));
989       PetscCall(ISDestroy(&list2));
990   #else
991       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
992   #endif
993       break;
994     case 2:
995   #if defined(PETSC_USE_COMPLEX)
996       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
997 
998       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
999       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
1000       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1001       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1002       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1003       PetscCall(VecScatterDestroy(&vecscat));
1004       PetscCall(ISDestroy(&list1));
1005       PetscCall(ISDestroy(&list2));
1006   #else
1007       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1008 
1009       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
1010       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
1011 
1012       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1013       else NM = dim[1] + 1;
1014 
1015       for (i = 0; i < local_n0; i++) {
1016         for (j = 0; j < dim[1]; j++) {
1017           tempindx  = i * dim[1] + j;
1018           tempindx1 = i * NM + j;
1019 
1020           indx1[tempindx] = local_0_start * dim[1] + tempindx;
1021           indx2[tempindx] = low + tempindx1;
1022         }
1023       }
1024 
1025       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
1026       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
1027 
1028       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1029       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1030       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1031       PetscCall(VecScatterDestroy(&vecscat));
1032       PetscCall(ISDestroy(&list1));
1033       PetscCall(ISDestroy(&list2));
1034       PetscCall(PetscFree(indx1));
1035       PetscCall(PetscFree(indx2));
1036   #endif
1037       break;
1038     case 3:
1039   #if defined(PETSC_USE_COMPLEX)
1040       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1041 
1042       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
1043       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
1044       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1045       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1046       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1047       PetscCall(VecScatterDestroy(&vecscat));
1048       PetscCall(ISDestroy(&list1));
1049       PetscCall(ISDestroy(&list2));
1050   #else
1051       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1052 
1053       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
1054       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
1055 
1056       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1057       else NM = dim[2] + 1;
1058 
1059       for (i = 0; i < local_n0; i++) {
1060         for (j = 0; j < dim[1]; j++) {
1061           for (k = 0; k < dim[2]; k++) {
1062             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1063             tempindx1 = i * dim[1] * NM + j * NM + k;
1064 
1065             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
1066             indx2[tempindx] = low + tempindx1;
1067           }
1068         }
1069       }
1070 
1071       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
1072       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
1073 
1074       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1075       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1076       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1077       PetscCall(VecScatterDestroy(&vecscat));
1078       PetscCall(ISDestroy(&list1));
1079       PetscCall(ISDestroy(&list2));
1080       PetscCall(PetscFree(indx1));
1081       PetscCall(PetscFree(indx2));
1082   #endif
1083       break;
1084     default:
1085   #if defined(PETSC_USE_COMPLEX)
1086       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
1087 
1088       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
1089       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
1090       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1091       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1092       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1093       PetscCall(VecScatterDestroy(&vecscat));
1094       PetscCall(ISDestroy(&list1));
1095       PetscCall(ISDestroy(&list2));
1096   #else
1097       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
1098 
1099       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
1100 
1101       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1102 
1103       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
1104 
1105       partial_dim = fftw->partial_dim;
1106 
1107       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
1108       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
1109 
1110       if (dim[ndim - 1] % 2 == 0) NM = 2;
1111       else NM = 1;
1112 
1113       j = low;
1114       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
1115         indx1[i] = local_0_start * partial_dim + i;
1116         indx2[i] = j;
1117         if (k % dim[ndim - 1] == 0) j += NM;
1118         j++;
1119       }
1120       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
1121       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
1122 
1123       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1124       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1125       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1126       PetscCall(VecScatterDestroy(&vecscat));
1127       PetscCall(ISDestroy(&list1));
1128       PetscCall(ISDestroy(&list2));
1129       PetscCall(PetscFree(indx1));
1130       PetscCall(PetscFree(indx2));
1131   #endif
1132       break;
1133     }
1134 #endif
1135   }
1136   PetscFunctionReturn(PETSC_SUCCESS);
1137 }
1138 
1139 /*MC
1140   MATFFTW -  "fftw" - Matrix type that provides FFT via the FFTW external package.
1141 
1142   Level: intermediate
1143 
1144 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatType`, `MatCreateFFT()`
1145 M*/
1146 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1147 {
1148   MPI_Comm    comm;
1149   Mat_FFT    *fft = (Mat_FFT *)A->data;
1150   Mat_FFTW   *fftw;
1151   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1152   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1153   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1154   PetscBool   flg;
1155   PetscInt    p_flag, partial_dim = 1, ctr;
1156   PetscMPIInt size, rank;
1157   ptrdiff_t  *pdim;
1158 #if !defined(PETSC_USE_COMPLEX)
1159   PetscInt tot_dim;
1160 #endif
1161 
1162   PetscFunctionBegin;
1163   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1164   PetscCallMPI(MPI_Comm_size(comm, &size));
1165   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1166 
1167 #if !PetscDefined(HAVE_MPIUNI)
1168   fftw_mpi_init();
1169 #endif
1170   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1171   pdim[0] = dim[0];
1172 #if !defined(PETSC_USE_COMPLEX)
1173   tot_dim = 2 * dim[0];
1174 #endif
1175   for (ctr = 1; ctr < ndim; ctr++) {
1176     partial_dim *= dim[ctr];
1177     pdim[ctr] = dim[ctr];
1178 #if !defined(PETSC_USE_COMPLEX)
1179     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1180     else tot_dim *= dim[ctr];
1181 #endif
1182   }
1183 
1184   if (size == 1) {
1185 #if defined(PETSC_USE_COMPLEX)
1186     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
1187     fft->n = fft->N;
1188 #else
1189     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
1190     fft->n       = tot_dim;
1191 #endif
1192 #if !PetscDefined(HAVE_MPIUNI)
1193   } else {
1194     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1195   #if !defined(PETSC_USE_COMPLEX)
1196     ptrdiff_t temp;
1197     PetscInt  N1;
1198   #endif
1199 
1200     switch (ndim) {
1201     case 1:
1202   #if !defined(PETSC_USE_COMPLEX)
1203       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1204   #else
1205       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1206       fft->n = (PetscInt)local_n0;
1207       PetscCall(MatSetSizes(A, local_n1, fft->n, fft->N, fft->N));
1208   #endif
1209       break;
1210     case 2:
1211   #if defined(PETSC_USE_COMPLEX)
1212       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1213       fft->n = (PetscInt)local_n0 * dim[1];
1214       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1215   #else
1216       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1217 
1218       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1219       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
1220   #endif
1221       break;
1222     case 3:
1223   #if defined(PETSC_USE_COMPLEX)
1224       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1225 
1226       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1227       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1228   #else
1229       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1230 
1231       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1232       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
1233   #endif
1234       break;
1235     default:
1236   #if defined(PETSC_USE_COMPLEX)
1237       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);
1238 
1239       fft->n = (PetscInt)local_n0 * partial_dim;
1240       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1241   #else
1242       temp = pdim[ndim - 1];
1243 
1244       pdim[ndim - 1] = temp / 2 + 1;
1245 
1246       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1247 
1248       fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
1249       N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);
1250 
1251       pdim[ndim - 1] = temp;
1252 
1253       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1254   #endif
1255       break;
1256     }
1257 #endif
1258   }
1259   free(pdim);
1260   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
1261   PetscCall(PetscNew(&fftw));
1262   fft->data = (void *)fftw;
1263 
1264   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1265   fftw->partial_dim = partial_dim;
1266 
1267   PetscCall(PetscMalloc1(ndim, &(fftw->dim_fftw)));
1268   if (size == 1) {
1269 #if defined(PETSC_USE_64BIT_INDICES)
1270     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1271 #else
1272     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1273 #endif
1274   }
1275 
1276   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];
1277 
1278   fftw->p_forward  = NULL;
1279   fftw->p_backward = NULL;
1280   fftw->p_flag     = FFTW_ESTIMATE;
1281 
1282   if (size == 1) {
1283     A->ops->mult          = MatMult_SeqFFTW;
1284     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1285 #if !PetscDefined(HAVE_MPIUNI)
1286   } else {
1287     A->ops->mult          = MatMult_MPIFFTW;
1288     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1289 #endif
1290   }
1291   fft->matdestroy = MatDestroy_FFTW;
1292   A->assembled    = PETSC_TRUE;
1293   A->preallocated = PETSC_TRUE;
1294 
1295   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
1296   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
1297   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));
1298 
1299   /* get runtime options */
1300   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1301   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
1302   if (flg) fftw->p_flag = iplans[p_flag];
1303   PetscOptionsEnd();
1304   PetscFunctionReturn(PETSC_SUCCESS);
1305 }
1306