xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 27f49a208b01d2e827ab9db411a2d16003fe9262)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #if !PetscDefined(HAVE_MPIUNI)
10   #include <fftw3-mpi.h>
11 #else
12   #include <fftw3.h>
13 #endif
14 EXTERN_C_END
15 
typedef struct {
  ptrdiff_t ndim_fftw, *dim_fftw; /* transform rank and dimensions in FFTW's ptrdiff_t type (as required by the fftw_mpi planners) */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims; /* guru-interface dimension descriptors (64-bit strides), filled in MatMult_SeqFFTW for ndim > 3 */
#else
  fftw_iodim *iodims; /* guru-interface dimension descriptors, filled in MatMult_SeqFFTW for ndim > 3 */
#endif
  PetscInt     partial_dim;                                 /* NOTE(review): not referenced in this file chunk; presumably a partial product of the dimensions used for parallel layouts — confirm at the allocation site */
  fftw_plan    p_forward, p_backward;                       /* cached FFTW plans, created lazily on first MatMult/MatMultTranspose */
  unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays because fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
29 
30 extern PetscErrorCode MatMult_SeqFFTW(Mat, Vec, Vec);
31 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat, Vec, Vec);
32 extern PetscErrorCode MatDestroy_FFTW(Mat);
33 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
34 #if !PetscDefined(HAVE_MPIUNI)
35 extern PetscErrorCode MatMult_MPIFFTW(Mat, Vec, Vec);
36 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat, Vec, Vec);
37 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
38 #endif
39 
/*
   MatMult_SeqFFTW - applies the forward FFT, y = FFT(x), uniprocessor case.

   The FFTW plan is created lazily on the first application and cached in
   fftw->p_forward. FFTW ties a plan to the arrays supplied at planning time,
   so when the vectors' arrays differ from the cached finarray/foutarray the
   new-array execute interface (fftw_execute_dft / fftw_execute_dft_r2c) is
   used instead of fftw_execute().
*/
PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
  #else
  fftw_iodim *iodims;
  #endif
  PetscInt i;
#endif
  PetscInt ndim = fft->ndim, *dim = fft->dim;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      /* real scalars: real-to-complex transform */
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    default: /* arbitrary rank: use the guru planner with explicit dimension descriptors */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
  #if defined(PETSC_USE_64BIT_INDICES)
      /* Row-major layout: the last dimension has unit stride; each earlier
         dimension strides over the product of all dimensions after it.
         Input and output strides are identical (in-place-compatible layout). */
      if (ndim) {
        iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #else
      /* same row-major stride computation with int-sized fields */
      if (ndim) {
        iodims[ndim - 1].n  = (int)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    }
    /* remember which arrays the plan was created for (see struct comment) */
    fftw->finarray  = (PetscScalar *)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else {                                                         /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
132 
/*
   MatMultTranspose_SeqFFTW - applies the backward transform (FFTW_BACKWARD),
   y = BFFT(x), uniprocessor case. The backward plan is created lazily and
   cached in fftw->p_backward; for ndim > 3 it reuses the iodims descriptors
   filled in by MatMult_SeqFFTW.

   NOTE(review): FFTW backward transforms are unnormalized — confirm callers
   apply the 1/N scaling where needed.
*/
PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims = fftw->iodims;
  #else
  fftw_iodim *iodims = fftw->iodims;
  #endif
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      /* real scalars: complex-to-real transform */
      fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    default: /* arbitrary rank: guru planner; iodims were computed by MatMult_SeqFFTW */
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
  #endif
#else
      fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    }
    /* remember which arrays the plan was created for */
    fftw->binarray  = (PetscScalar *)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else {                                                         /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
204 
205 #if !PetscDefined(HAVE_MPIUNI)
206 PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y)
207 {
208   Mat_FFT           *fft  = (Mat_FFT *)A->data;
209   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
210   const PetscScalar *x_array;
211   PetscScalar       *y_array;
212   PetscInt           ndim = fft->ndim, *dim = fft->dim;
213   MPI_Comm           comm;
214 
215   PetscFunctionBegin;
216   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
217   PetscCall(VecGetArrayRead(x, &x_array));
218   PetscCall(VecGetArray(y, &y_array));
219   if (!fftw->p_forward) { /* create a plan, then execute it */
220     switch (ndim) {
221     case 1:
222   #if defined(PETSC_USE_COMPLEX)
223       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
224   #else
225       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
226   #endif
227       break;
228     case 2:
229   #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
230       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
231   #else
232       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
233   #endif
234       break;
235     case 3:
236   #if defined(PETSC_USE_COMPLEX)
237       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
238   #else
239       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
240   #endif
241       break;
242     default:
243   #if defined(PETSC_USE_COMPLEX)
244       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
245   #else
246       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
247   #endif
248       break;
249     }
250     fftw->finarray  = (PetscScalar *)x_array;
251     fftw->foutarray = y_array;
252     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
253                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
254     fftw_execute(fftw->p_forward);
255   } else {                                                         /* use existing plan */
256     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
257       fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
258     } else {
259       fftw_execute(fftw->p_forward);
260     }
261   }
262   PetscCall(VecRestoreArray(y, &y_array));
263   PetscCall(VecRestoreArrayRead(x, &x_array));
264   PetscFunctionReturn(PETSC_SUCCESS);
265 }
266 
267 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y)
268 {
269   Mat_FFT           *fft  = (Mat_FFT *)A->data;
270   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
271   const PetscScalar *x_array;
272   PetscScalar       *y_array;
273   PetscInt           ndim = fft->ndim, *dim = fft->dim;
274   MPI_Comm           comm;
275 
276   PetscFunctionBegin;
277   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
278   PetscCall(VecGetArrayRead(x, &x_array));
279   PetscCall(VecGetArray(y, &y_array));
280   if (!fftw->p_backward) { /* create a plan, then execute it */
281     switch (ndim) {
282     case 1:
283   #if defined(PETSC_USE_COMPLEX)
284       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
285   #else
286       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
287   #endif
288       break;
289     case 2:
290   #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
291       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
292   #else
293       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
294   #endif
295       break;
296     case 3:
297   #if defined(PETSC_USE_COMPLEX)
298       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
299   #else
300       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
301   #endif
302       break;
303     default:
304   #if defined(PETSC_USE_COMPLEX)
305       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
306   #else
307       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
308   #endif
309       break;
310     }
311     fftw->binarray  = (PetscScalar *)x_array;
312     fftw->boutarray = y_array;
313     fftw_execute(fftw->p_backward);
314   } else {                                                         /* use existing plan */
315     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
316       fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
317     } else {
318       fftw_execute(fftw->p_backward);
319     }
320   }
321   PetscCall(VecRestoreArray(y, &y_array));
322   PetscCall(VecRestoreArrayRead(x, &x_array));
323   PetscFunctionReturn(PETSC_SUCCESS);
324 }
325 #endif
326 
327 PetscErrorCode MatDestroy_FFTW(Mat A)
328 {
329   Mat_FFT  *fft  = (Mat_FFT *)A->data;
330   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
331 
332   PetscFunctionBegin;
333   fftw_destroy_plan(fftw->p_forward);
334   fftw_destroy_plan(fftw->p_backward);
335   if (fftw->iodims) free(fftw->iodims);
336   PetscCall(PetscFree(fftw->dim_fftw));
337   PetscCall(PetscFree(fft->data));
338   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
339   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
340   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
341 #if !PetscDefined(HAVE_MPIUNI)
342   fftw_mpi_cleanup();
343 #endif
344   PetscFunctionReturn(PETSC_SUCCESS);
345 }
346 
347 #if !PetscDefined(HAVE_MPIUNI)
348   #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
349 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
350 {
351   PetscScalar *array;
352 
353   PetscFunctionBegin;
354   PetscCall(VecGetArray(v, &array));
355   fftw_free((fftw_complex *)array);
356   PetscCall(VecRestoreArray(v, &array));
357   PetscCall(VecDestroy_MPI(v));
358   PetscFunctionReturn(PETSC_SUCCESS);
359 }
360 #endif
361 
362 #if !PetscDefined(HAVE_MPIUNI)
363 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new)
364 {
365   Mat A;
366 
367   PetscFunctionBegin;
368   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
369   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
370   PetscFunctionReturn(PETSC_SUCCESS);
371 }
372 
373 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new)
374 {
375   Mat A;
376 
377   PetscFunctionBegin;
378   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
379   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
380   PetscFunctionReturn(PETSC_SUCCESS);
381 }
382 
383 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
384 {
385   Mat A;
386 
387   PetscFunctionBegin;
388   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
389   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
390   PetscFunctionReturn(PETSC_SUCCESS);
391 }
392 #endif
393 
394 /*@
395    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
396      parallel layout determined by `MATFFTW`
397 
398    Collective
399 
400    Input Parameter:
401 .   A - the matrix
402 
403    Output Parameters:
404 +   x - (optional) input vector of forward FFTW
405 .   y - (optional) output vector of forward FFTW
406 -   z - (optional) output vector of backward FFTW
407 
408    Options Database Key:
409 .  -mat_fftw_plannerflags - set FFTW planner flags
410 
411   Level: advanced
412 
413   Notes:
414   The parallel layout of output of forward FFTW is always same as the input
415   of the backward FFTW. But parallel layout of the input vector of forward
416   FFTW might not be same as the output of backward FFTW.
417 
418   We need to provide enough space while doing parallel real transform.
  We need to pad extra zeros at the end of the last dimension. For this reason one needs to
  invoke the fftw_mpi_local_size_transposed routine. Remember one has to change the
421   last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
422   depends on if the last dimension is even or odd. If the last dimension is even need to pad two
423   zeros if it is odd only one zero is needed.
424 
425   Lastly one needs some scratch space at the end of data set in each process. alloc_local
426   figures out how much space is needed, i.e. it figures out the data+scratch space for
427   each processor and returns that.
428 
429   Developer Note:
430   How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?
431 
432 .seealso: [](chapter_matrices), `Mat`, `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
433 @*/
PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A (MatCreateVecsFFTW_FFTW);
     errors if A is not a MATFFTW matrix */
  PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
  PetscFunctionReturn(PETSC_SUCCESS);
}
440 
441 PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout)
442 {
443   PetscMPIInt size, rank;
444   MPI_Comm    comm;
445   Mat_FFT    *fft = (Mat_FFT *)A->data;
446 
447   PetscFunctionBegin;
448   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
449   PetscValidType(A, 1);
450   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
451 
452   PetscCallMPI(MPI_Comm_size(comm, &size));
453   PetscCallMPI(MPI_Comm_rank(comm, &rank));
454   if (size == 1) { /* sequential case */
455 #if defined(PETSC_USE_COMPLEX)
456     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
457     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
458     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
459 #else
460     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
461     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
462     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
463 #endif
464 #if !PetscDefined(HAVE_MPIUNI)
465   } else { /* parallel cases */
466     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
467     PetscInt      ndim = fft->ndim, *dim = fft->dim;
468     ptrdiff_t     alloc_local, local_n0, local_0_start;
469     ptrdiff_t     local_n1;
470     fftw_complex *data_fout;
471     ptrdiff_t     local_1_start;
472   #if defined(PETSC_USE_COMPLEX)
473     fftw_complex *data_fin, *data_bout;
474   #else
475     double   *data_finr, *data_boutr;
476     PetscInt  n1, N1;
477     ptrdiff_t temp;
478   #endif
479 
480     switch (ndim) {
481     case 1:
482   #if !defined(PETSC_USE_COMPLEX)
483       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
484   #else
485       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
486       if (fin) {
487         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
488         PetscCall(VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin));
489         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
490         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
491         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
492       }
493       if (fout) {
494         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
495         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout));
496         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
497         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
498         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
499       }
500       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
501       if (bout) {
502         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
503         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout));
504         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
505         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
506         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
507       }
508       break;
509   #endif
510     case 2:
511   #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
512       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
513       N1          = 2 * dim[0] * (dim[1] / 2 + 1);
514       n1          = 2 * local_n0 * (dim[1] / 2 + 1);
515       if (fin) {
516         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
517         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
518         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
519         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
520         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
521       }
522       if (fout) {
523         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
524         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
525         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
526         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
527         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
528       }
529       if (bout) {
530         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
531         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
532         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
533         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
534         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
535       }
536   #else
537       /* Get local size */
538       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
539       if (fin) {
540         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
541         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
542         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
543         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
544         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
545       }
546       if (fout) {
547         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
548         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
549         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
550         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
551         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
552       }
553       if (bout) {
554         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
555         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
556         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
557         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
558         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
559       }
560   #endif
561       break;
562     case 3:
563   #if !defined(PETSC_USE_COMPLEX)
564       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
565       N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
566       n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
567       if (fin) {
568         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
569         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
570         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
571         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
572         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
573       }
574       if (fout) {
575         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
576         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
577         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
578         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
579         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
580       }
581       if (bout) {
582         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
583         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
584         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
585         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
586         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
587       }
588   #else
589       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
590       if (fin) {
591         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
592         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
593         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
594         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
595         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
596       }
597       if (fout) {
598         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
599         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
600         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
601         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
602         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
603       }
604       if (bout) {
605         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
606         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
607         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
608         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
609         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
610       }
611   #endif
612       break;
613     default:
614   #if !defined(PETSC_USE_COMPLEX)
615       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
616       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
617       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
618       N1                                    = 2 * fft->N * (PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
619       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
620 
621       if (fin) {
622         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
623         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
624         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
625         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
626         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
627       }
628       if (fout) {
629         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
630         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
631         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
632         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
633         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
634       }
635       if (bout) {
636         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
637         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
638         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
639         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
640         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
641       }
642   #else
643       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
644       if (fin) {
645         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
646         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
647         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
648         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
649         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
650       }
651       if (fout) {
652         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
653         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
654         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
655         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
656         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
657       }
658       if (bout) {
659         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
660         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
661         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
662         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
663         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
664       }
665   #endif
666       break;
667     }
668     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
669        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
670        memory leaks. We void these pointers here to avoid accident uses.
671     */
672     if (fin) (*fin)->ops->replacearray = NULL;
673     if (fout) (*fout)->ops->replacearray = NULL;
674     if (bout) (*bout)->ops->replacearray = NULL;
675 #endif
676   }
677   PetscFunctionReturn(PETSC_SUCCESS);
678 }
679 
680 /*@
681    VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.
682 
683    Collective
684 
685    Input Parameters:
686 +  A - FFTW matrix
687 -  x - the PETSc vector
688 
689    Output Parameters:
690 .  y - the FFTW vector
691 
692    Level: intermediate
693 
694    Note:
   For parallel real FFTs, FFTW requires the insertion of extra space at the end of the last dimension. This is required even when
   one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
   zeros. This routine does that job by a scattering operation.
698 
699 .seealso: [](chapter_matrices), `Mat`, `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
700 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A (VecScatterPetscToFFTW_FFTW for MATFFTW);
     PetscUseMethod() raises an error if A has no such method composed */
  PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
707 
708 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y)
709 {
710   MPI_Comm    comm;
711   Mat_FFT    *fft = (Mat_FFT *)A->data;
712   PetscInt    low;
713   PetscMPIInt rank, size;
714   PetscInt    vsize, vsize1;
715   VecScatter  vecscat;
716   IS          list1;
717 
718   PetscFunctionBegin;
719   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
720   PetscCallMPI(MPI_Comm_size(comm, &size));
721   PetscCallMPI(MPI_Comm_rank(comm, &rank));
722   PetscCall(VecGetOwnershipRange(y, &low, NULL));
723 
724   if (size == 1) {
725     PetscCall(VecGetSize(x, &vsize));
726     PetscCall(VecGetSize(y, &vsize1));
727     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
728     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
729     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
730     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
731     PetscCall(VecScatterDestroy(&vecscat));
732     PetscCall(ISDestroy(&list1));
733 #if !PetscDefined(HAVE_MPIUNI)
734   } else {
735     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
736     PetscInt  ndim = fft->ndim, *dim = fft->dim;
737     ptrdiff_t local_n0, local_0_start;
738     ptrdiff_t local_n1, local_1_start;
739     IS        list2;
740   #if !defined(PETSC_USE_COMPLEX)
741     PetscInt  i, j, k, partial_dim;
742     PetscInt *indx1, *indx2, tempindx, tempindx1;
743     PetscInt  NM;
744     ptrdiff_t temp;
745   #endif
746 
747     switch (ndim) {
748     case 1:
749   #if defined(PETSC_USE_COMPLEX)
750       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
751 
752       PetscCall(ISCreateStride(comm, local_n0, local_0_start, 1, &list1));
753       PetscCall(ISCreateStride(comm, local_n0, low, 1, &list2));
754       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
755       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
756       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
757       PetscCall(VecScatterDestroy(&vecscat));
758       PetscCall(ISDestroy(&list1));
759       PetscCall(ISDestroy(&list2));
760   #else
761       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
762   #endif
763       break;
764     case 2:
765   #if defined(PETSC_USE_COMPLEX)
766       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
767 
768       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
769       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
770       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
771       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
772       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
773       PetscCall(VecScatterDestroy(&vecscat));
774       PetscCall(ISDestroy(&list1));
775       PetscCall(ISDestroy(&list2));
776   #else
777       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
778 
779       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
780       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
781 
782       if (dim[1] % 2 == 0) {
783         NM = dim[1] + 2;
784       } else {
785         NM = dim[1] + 1;
786       }
787       for (i = 0; i < local_n0; i++) {
788         for (j = 0; j < dim[1]; j++) {
789           tempindx  = i * dim[1] + j;
790           tempindx1 = i * NM + j;
791 
792           indx1[tempindx] = local_0_start * dim[1] + tempindx;
793           indx2[tempindx] = low + tempindx1;
794         }
795       }
796 
797       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
798       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
799 
800       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
801       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
802       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
803       PetscCall(VecScatterDestroy(&vecscat));
804       PetscCall(ISDestroy(&list1));
805       PetscCall(ISDestroy(&list2));
806       PetscCall(PetscFree(indx1));
807       PetscCall(PetscFree(indx2));
808   #endif
809       break;
810 
811     case 3:
812   #if defined(PETSC_USE_COMPLEX)
813       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
814 
815       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
816       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
817       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
818       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
819       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
820       PetscCall(VecScatterDestroy(&vecscat));
821       PetscCall(ISDestroy(&list1));
822       PetscCall(ISDestroy(&list2));
823   #else
824       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
825       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
826       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
827 
828       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
829       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
830 
831       if (dim[2] % 2 == 0) NM = dim[2] + 2;
832       else NM = dim[2] + 1;
833 
834       for (i = 0; i < local_n0; i++) {
835         for (j = 0; j < dim[1]; j++) {
836           for (k = 0; k < dim[2]; k++) {
837             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
838             tempindx1 = i * dim[1] * NM + j * NM + k;
839 
840             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
841             indx2[tempindx] = low + tempindx1;
842           }
843         }
844       }
845 
846       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
847       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
848       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
849       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
850       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
851       PetscCall(VecScatterDestroy(&vecscat));
852       PetscCall(ISDestroy(&list1));
853       PetscCall(ISDestroy(&list2));
854       PetscCall(PetscFree(indx1));
855       PetscCall(PetscFree(indx2));
856   #endif
857       break;
858 
859     default:
860   #if defined(PETSC_USE_COMPLEX)
861       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
862 
863       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
864       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
865       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
866       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
867       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
868       PetscCall(VecScatterDestroy(&vecscat));
869       PetscCall(ISDestroy(&list1));
870       PetscCall(ISDestroy(&list2));
871   #else
872       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
873       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
874       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
875 
876       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
877 
878       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
879 
880       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
881 
882       partial_dim = fftw->partial_dim;
883 
884       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
885       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
886 
887       if (dim[ndim - 1] % 2 == 0) NM = 2;
888       else NM = 1;
889 
890       j = low;
891       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
892         indx1[i] = local_0_start * partial_dim + i;
893         indx2[i] = j;
894         if (k % dim[ndim - 1] == 0) j += NM;
895         j++;
896       }
897       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
898       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
899       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
900       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
901       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
902       PetscCall(VecScatterDestroy(&vecscat));
903       PetscCall(ISDestroy(&list1));
904       PetscCall(ISDestroy(&list2));
905       PetscCall(PetscFree(indx1));
906       PetscCall(PetscFree(indx2));
907   #endif
908       break;
909     }
910 #endif
911   }
912   PetscFunctionReturn(PETSC_SUCCESS);
913 }
914 
915 /*@
916    VecScatterFFTWToPetsc - Converts `MATFFTW` output vector to a PETSc vector.
917 
918    Collective
919 
920     Input Parameters:
921 +   A - `MATFFTW` matrix
922 -   x - FFTW vector
923 
924    Output Parameters:
925 .  y - PETSc vector
926 
927    Level: intermediate
928 
929    Note:
   While doing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
   `VecScatterFFTWToPetsc()` removes those extra zeros.
932 
933 .seealso: [](chapter_matrices), `Mat`, `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
934 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A (VecScatterFFTWToPetsc_FFTW for MATFFTW);
     PetscUseMethod() raises an error if A has no such method composed */
  PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
941 
942 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y)
943 {
944   MPI_Comm    comm;
945   Mat_FFT    *fft = (Mat_FFT *)A->data;
946   PetscInt    low;
947   PetscMPIInt rank, size;
948   VecScatter  vecscat;
949   IS          list1;
950 
951   PetscFunctionBegin;
952   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
953   PetscCallMPI(MPI_Comm_size(comm, &size));
954   PetscCallMPI(MPI_Comm_rank(comm, &rank));
955   PetscCall(VecGetOwnershipRange(x, &low, NULL));
956 
957   if (size == 1) {
958     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
959     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
960     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
961     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
962     PetscCall(VecScatterDestroy(&vecscat));
963     PetscCall(ISDestroy(&list1));
964 
965 #if !PetscDefined(HAVE_MPIUNI)
966   } else {
967     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
968     PetscInt  ndim = fft->ndim, *dim = fft->dim;
969     ptrdiff_t local_n0, local_0_start;
970     ptrdiff_t local_n1, local_1_start;
971     IS        list2;
972   #if !defined(PETSC_USE_COMPLEX)
973     PetscInt  i, j, k, partial_dim;
974     PetscInt *indx1, *indx2, tempindx, tempindx1;
975     PetscInt  NM;
976     ptrdiff_t temp;
977   #endif
978     switch (ndim) {
979     case 1:
980   #if defined(PETSC_USE_COMPLEX)
981       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
982 
983       PetscCall(ISCreateStride(comm, local_n1, local_1_start, 1, &list1));
984       PetscCall(ISCreateStride(comm, local_n1, low, 1, &list2));
985       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
986       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
987       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
988       PetscCall(VecScatterDestroy(&vecscat));
989       PetscCall(ISDestroy(&list1));
990       PetscCall(ISDestroy(&list2));
991   #else
992       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
993   #endif
994       break;
995     case 2:
996   #if defined(PETSC_USE_COMPLEX)
997       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
998 
999       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
1000       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
1001       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1002       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1003       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1004       PetscCall(VecScatterDestroy(&vecscat));
1005       PetscCall(ISDestroy(&list1));
1006       PetscCall(ISDestroy(&list2));
1007   #else
1008       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1009 
1010       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
1011       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
1012 
1013       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1014       else NM = dim[1] + 1;
1015 
1016       for (i = 0; i < local_n0; i++) {
1017         for (j = 0; j < dim[1]; j++) {
1018           tempindx  = i * dim[1] + j;
1019           tempindx1 = i * NM + j;
1020 
1021           indx1[tempindx] = local_0_start * dim[1] + tempindx;
1022           indx2[tempindx] = low + tempindx1;
1023         }
1024       }
1025 
1026       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
1027       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
1028 
1029       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1030       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1031       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1032       PetscCall(VecScatterDestroy(&vecscat));
1033       PetscCall(ISDestroy(&list1));
1034       PetscCall(ISDestroy(&list2));
1035       PetscCall(PetscFree(indx1));
1036       PetscCall(PetscFree(indx2));
1037   #endif
1038       break;
1039     case 3:
1040   #if defined(PETSC_USE_COMPLEX)
1041       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1042 
1043       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
1044       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
1045       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1046       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1047       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1048       PetscCall(VecScatterDestroy(&vecscat));
1049       PetscCall(ISDestroy(&list1));
1050       PetscCall(ISDestroy(&list2));
1051   #else
1052       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1053 
1054       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
1055       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
1056 
1057       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1058       else NM = dim[2] + 1;
1059 
1060       for (i = 0; i < local_n0; i++) {
1061         for (j = 0; j < dim[1]; j++) {
1062           for (k = 0; k < dim[2]; k++) {
1063             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1064             tempindx1 = i * dim[1] * NM + j * NM + k;
1065 
1066             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
1067             indx2[tempindx] = low + tempindx1;
1068           }
1069         }
1070       }
1071 
1072       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
1073       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
1074 
1075       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1076       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1077       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1078       PetscCall(VecScatterDestroy(&vecscat));
1079       PetscCall(ISDestroy(&list1));
1080       PetscCall(ISDestroy(&list2));
1081       PetscCall(PetscFree(indx1));
1082       PetscCall(PetscFree(indx2));
1083   #endif
1084       break;
1085     default:
1086   #if defined(PETSC_USE_COMPLEX)
1087       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
1088 
1089       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
1090       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
1091       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1092       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1093       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1094       PetscCall(VecScatterDestroy(&vecscat));
1095       PetscCall(ISDestroy(&list1));
1096       PetscCall(ISDestroy(&list2));
1097   #else
1098       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
1099 
1100       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
1101 
1102       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1103 
1104       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
1105 
1106       partial_dim = fftw->partial_dim;
1107 
1108       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
1109       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
1110 
1111       if (dim[ndim - 1] % 2 == 0) NM = 2;
1112       else NM = 1;
1113 
1114       j = low;
1115       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
1116         indx1[i] = local_0_start * partial_dim + i;
1117         indx2[i] = j;
1118         if (k % dim[ndim - 1] == 0) j += NM;
1119         j++;
1120       }
1121       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
1122       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
1123 
1124       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1125       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1126       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1127       PetscCall(VecScatterDestroy(&vecscat));
1128       PetscCall(ISDestroy(&list1));
1129       PetscCall(ISDestroy(&list2));
1130       PetscCall(PetscFree(indx1));
1131       PetscCall(PetscFree(indx2));
1132   #endif
1133       break;
1134     }
1135 #endif
1136   }
1137   PetscFunctionReturn(PETSC_SUCCESS);
1138 }
1139 
1140 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1141 {
1142   MPI_Comm    comm;
1143   Mat_FFT    *fft = (Mat_FFT *)A->data;
1144   Mat_FFTW   *fftw;
1145   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1146   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1147   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1148   PetscBool   flg;
1149   PetscInt    p_flag, partial_dim = 1, ctr;
1150   PetscMPIInt size, rank;
1151   ptrdiff_t  *pdim;
1152 #if !defined(PETSC_USE_COMPLEX)
1153   PetscInt tot_dim;
1154 #endif
1155 
1156   PetscFunctionBegin;
1157   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1158   PetscCallMPI(MPI_Comm_size(comm, &size));
1159   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1160 
1161 #if !PetscDefined(HAVE_MPIUNI)
1162   fftw_mpi_init();
1163 #endif
1164   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1165   pdim[0] = dim[0];
1166 #if !defined(PETSC_USE_COMPLEX)
1167   tot_dim = 2 * dim[0];
1168 #endif
1169   for (ctr = 1; ctr < ndim; ctr++) {
1170     partial_dim *= dim[ctr];
1171     pdim[ctr] = dim[ctr];
1172 #if !defined(PETSC_USE_COMPLEX)
1173     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1174     else tot_dim *= dim[ctr];
1175 #endif
1176   }
1177 
1178   if (size == 1) {
1179 #if defined(PETSC_USE_COMPLEX)
1180     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
1181     fft->n = fft->N;
1182 #else
1183     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
1184     fft->n       = tot_dim;
1185 #endif
1186 #if !PetscDefined(HAVE_MPIUNI)
1187   } else {
1188     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1189   #if !defined(PETSC_USE_COMPLEX)
1190     ptrdiff_t temp;
1191     PetscInt  N1;
1192   #endif
1193 
1194     switch (ndim) {
1195     case 1:
1196   #if !defined(PETSC_USE_COMPLEX)
1197       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1198   #else
1199       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1200       fft->n = (PetscInt)local_n0;
1201       PetscCall(MatSetSizes(A, local_n1, fft->n, fft->N, fft->N));
1202   #endif
1203       break;
1204     case 2:
1205   #if defined(PETSC_USE_COMPLEX)
1206       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1207       fft->n = (PetscInt)local_n0 * dim[1];
1208       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1209   #else
1210       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1211 
1212       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1213       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
1214   #endif
1215       break;
1216     case 3:
1217   #if defined(PETSC_USE_COMPLEX)
1218       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1219 
1220       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1221       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1222   #else
1223       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1224 
1225       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1226       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
1227   #endif
1228       break;
1229     default:
1230   #if defined(PETSC_USE_COMPLEX)
1231       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);
1232 
1233       fft->n = (PetscInt)local_n0 * partial_dim;
1234       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1235   #else
1236       temp = pdim[ndim - 1];
1237 
1238       pdim[ndim - 1] = temp / 2 + 1;
1239 
1240       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1241 
1242       fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
1243       N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);
1244 
1245       pdim[ndim - 1] = temp;
1246 
1247       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1248   #endif
1249       break;
1250     }
1251 #endif
1252   }
1253   free(pdim);
1254   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
1255   PetscCall(PetscNew(&fftw));
1256   fft->data = (void *)fftw;
1257 
1258   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1259   fftw->partial_dim = partial_dim;
1260 
1261   PetscCall(PetscMalloc1(ndim, &(fftw->dim_fftw)));
1262   if (size == 1) {
1263 #if defined(PETSC_USE_64BIT_INDICES)
1264     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1265 #else
1266     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1267 #endif
1268   }
1269 
1270   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];
1271 
1272   fftw->p_forward  = NULL;
1273   fftw->p_backward = NULL;
1274   fftw->p_flag     = FFTW_ESTIMATE;
1275 
1276   if (size == 1) {
1277     A->ops->mult          = MatMult_SeqFFTW;
1278     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1279 #if !PetscDefined(HAVE_MPIUNI)
1280   } else {
1281     A->ops->mult          = MatMult_MPIFFTW;
1282     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1283 #endif
1284   }
1285   fft->matdestroy = MatDestroy_FFTW;
1286   A->assembled    = PETSC_TRUE;
1287   A->preallocated = PETSC_TRUE;
1288 
1289   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
1290   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
1291   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));
1292 
1293   /* get runtime options */
1294   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1295   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
1296   if (flg) fftw->p_flag = iplans[p_flag];
1297   PetscOptionsEnd();
1298   PetscFunctionReturn(PETSC_SUCCESS);
1299 }
1300