xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #if !PetscDefined(HAVE_MPIUNI)
10 #include <fftw3-mpi.h>
11 #else
12 #include <fftw3.h>
13 #endif
14 EXTERN_C_END
15 
typedef struct {
  ptrdiff_t ndim_fftw, *dim_fftw; /* number of dimensions and dimension sizes, in FFTW's native ptrdiff_t format */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims; /* dimension descriptors for FFTW's 64-bit guru interface */
#else
  fftw_iodim *iodims; /* dimension descriptors for FFTW's guru interface */
#endif
  PetscInt     partial_dim; /* NOTE(review): set by the creation routine (not shown here); presumably the product of all but the first dimension -- confirm */
  fftw_plan    p_forward, p_backward;                        /* cached plans, created lazily on first MatMult/MatMultTranspose */
  unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays becaue fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
29 
30 extern PetscErrorCode MatMult_SeqFFTW(Mat, Vec, Vec);
31 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat, Vec, Vec);
32 extern PetscErrorCode MatDestroy_FFTW(Mat);
33 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
34 #if !PetscDefined(HAVE_MPIUNI)
35 extern PetscErrorCode MatMult_MPIFFTW(Mat, Vec, Vec);
36 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat, Vec, Vec);
37 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
38 #endif
39 
40 /*
41    MatMult_SeqFFTW performs forward DFT
42    Input parameter:
43      A - the matrix
44      x - the vector on which FDFT will be performed
45 
46    Output parameter:
47      y - vector that stores result of FDFT
48 */
PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y) {
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
#else
  fftw_iodim *iodims;
#endif
  PetscInt i;
#endif
  PetscInt ndim = fft->ndim, *dim = fft->dim;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    /* Plans are cached: the first application plans with the current array
       addresses; later applications reuse the plan (see the else branch). */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      /* real scalars: real-to-complex transform */
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    default:
      /* ndim > 3: use FFTW's guru interface; build the iodim stride table for a
         contiguous row-major (C-order) array, i.e. the last dimension has stride 1 */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      if (ndim) {
        iodims[ndim - 1].n  = (int)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    }
    /* remember which arrays the plan was created for */
    fftw->finarray  = (PetscScalar *)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else {                                                         /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
      /* new-array execute; assumes the new arrays have the same alignment as
         those the plan was created with -- TODO confirm (FFTW manual 4.6) */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(0);
}
140 
141 /* MatMultTranspose_SeqFFTW performs serial backward DFT
142    Input parameter:
143      A - the matrix
144      x - the vector on which BDFT will be performed
145 
146    Output parameter:
147      y - vector that stores result of BDFT
148 */
149 
PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y) {
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  /* iodims were filled by the forward transform (MatMult_SeqFFTW) for ndim > 3;
     assumes the forward transform runs first -- TODO confirm */
  fftw_iodim64 *iodims = fftw->iodims;
#else
  fftw_iodim *iodims = fftw->iodims;
#endif
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      /* real scalars: complex-to-real transform */
      fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#endif
#else
      fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    }
    /* remember which arrays the plan was created for */
    fftw->binarray  = (PetscScalar *)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else {                                                         /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(0);
}
220 
221 #if !PetscDefined(HAVE_MPIUNI)
222 /* MatMult_MPIFFTW performs forward DFT in parallel
223    Input parameter:
224      A - the matrix
225      x - the vector on which FDFT will be performed
226 
227    Output parameter:
228    y   - vector that stores result of FDFT
229 */
PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y) {
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
#else
      /* FFTW-MPI has no parallel 1D real transform */
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
      fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
#else
      /* NOTE(review): real plans hard-code FFTW_ESTIMATE instead of fftw->p_flag,
         presumably to avoid the planner clobbering user data -- confirm intent */
      fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
#endif
      break;
    }
    /* remember which arrays the plan was created for */
    fftw->finarray  = (PetscScalar *)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else {                                                         /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
      /* NOTE(review): FFTW manual 6.7 says MPI plans should use fftw_mpi_execute_dft
         (and an r2c plan should not go through fftw_execute_dft) -- verify */
      fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(0);
}
289 
290 /*
291    MatMultTranspose_MPIFFTW performs parallel backward DFT
292    Input parameter:
293      A - the matrix
294      x - the vector on which BDFT will be performed
295 
296    Output parameter:
297      y - vector that stores result of BDFT
298 */
PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y) {
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
#else
      /* FFTW-MPI has no parallel 1D real transform */
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
      fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
#else
      /* NOTE(review): real plans hard-code FFTW_ESTIMATE instead of fftw->p_flag -- confirm intent */
      fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
#endif
      break;
    }
    /* remember which arrays the plan was created for */
    fftw->binarray  = (PetscScalar *)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else {                                                         /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
      /* NOTE(review): see forward version -- fftw_mpi_execute_dft may be required for MPI plans */
      fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(0);
}
356 #endif
357 
358 PetscErrorCode MatDestroy_FFTW(Mat A) {
359   Mat_FFT  *fft  = (Mat_FFT *)A->data;
360   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
361 
362   PetscFunctionBegin;
363   fftw_destroy_plan(fftw->p_forward);
364   fftw_destroy_plan(fftw->p_backward);
365   if (fftw->iodims) free(fftw->iodims);
366   PetscCall(PetscFree(fftw->dim_fftw));
367   PetscCall(PetscFree(fft->data));
368   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
369   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
370   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
371 #if !PetscDefined(HAVE_MPIUNI)
372   fftw_mpi_cleanup();
373 #endif
374   PetscFunctionReturn(0);
375 }
376 
377 #if !PetscDefined(HAVE_MPIUNI)
378 #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
379 PetscErrorCode VecDestroy_MPIFFTW(Vec v) {
380   PetscScalar *array;
381 
382   PetscFunctionBegin;
383   PetscCall(VecGetArray(v, &array));
384   fftw_free((fftw_complex *)array);
385   PetscCall(VecRestoreArray(v, &array));
386   PetscCall(VecDestroy_MPI(v));
387   PetscFunctionReturn(0);
388 }
389 #endif
390 
391 #if !PetscDefined(HAVE_MPIUNI)
392 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new) {
393   Mat A;
394 
395   PetscFunctionBegin;
396   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
397   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
398   PetscFunctionReturn(0);
399 }
400 
401 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new) {
402   Mat A;
403 
404   PetscFunctionBegin;
405   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
406   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
407   PetscFunctionReturn(0);
408 }
409 
410 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new) {
411   Mat A;
412 
413   PetscFunctionBegin;
414   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
415   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
416   PetscFunctionReturn(0);
417 }
418 #endif
419 
420 /*@
421    MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
422      parallel layout determined by FFTW
423 
424    Collective on Mat
425 
426    Input Parameter:
427 .   A - the matrix
428 
429    Output Parameters:
430 +   x - (optional) input vector of forward FFTW
431 .   y - (optional) output vector of forward FFTW
432 -   z - (optional) output vector of backward FFTW
433 
434   Level: advanced
435 
436   Note: The parallel layout of output of forward FFTW is always same as the input
437         of the backward FFTW. But parallel layout of the input vector of forward
438         FFTW might not be same as the output of backward FFTW.
        Also note that one must provide enough space when doing a parallel real transform:
        extra zeros must be padded at the end of the last dimension. For this reason one needs to
        invoke the routine fftw_mpi_local_size_transposed, changing the
        last dimension from n to n/2+1 when invoking it. The number of zeros to be padded
        depends on whether the last dimension is even or odd: if it is even, two zeros must be
        padded; if it is odd, only one is needed.
445         Lastly one needs some scratch space at the end of data set in each process. alloc_local
446         figures out how much space is needed, i.e. it figures out the data+scratch space for
447         each processor and returns that.
448 
449 .seealso: `MatCreateFFT()`
450 @*/
PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z) {
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A under "MatCreateVecsFFTW_C"
     (MatCreateVecsFFTW_FFTW below) */
  PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
  PetscFunctionReturn(0);
}
456 
457 PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout) {
458   PetscMPIInt size, rank;
459   MPI_Comm    comm;
460   Mat_FFT    *fft = (Mat_FFT *)A->data;
461 
462   PetscFunctionBegin;
463   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
464   PetscValidType(A, 1);
465   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
466 
467   PetscCallMPI(MPI_Comm_size(comm, &size));
468   PetscCallMPI(MPI_Comm_rank(comm, &rank));
469   if (size == 1) { /* sequential case */
470 #if defined(PETSC_USE_COMPLEX)
471     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
472     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
473     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
474 #else
475     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
476     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
477     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
478 #endif
479 #if !PetscDefined(HAVE_MPIUNI)
480   } else { /* parallel cases */
481     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
482     PetscInt      ndim = fft->ndim, *dim = fft->dim;
483     ptrdiff_t     alloc_local, local_n0, local_0_start;
484     ptrdiff_t     local_n1;
485     fftw_complex *data_fout;
486     ptrdiff_t     local_1_start;
487 #if defined(PETSC_USE_COMPLEX)
488     fftw_complex *data_fin, *data_bout;
489 #else
490     double   *data_finr, *data_boutr;
491     PetscInt  n1, N1;
492     ptrdiff_t temp;
493 #endif
494 
495     switch (ndim) {
496     case 1:
497 #if !defined(PETSC_USE_COMPLEX)
498       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
499 #else
500       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
501       if (fin) {
502         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
503         PetscCall(VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin));
504         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
505         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
506         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
507       }
508       if (fout) {
509         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
510         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout));
511         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
512         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
513         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
514       }
515       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
516       if (bout) {
517         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
518         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout));
519         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
520         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
521         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
522       }
523       break;
524 #endif
525     case 2:
526 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
527       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
528       N1          = 2 * dim[0] * (dim[1] / 2 + 1);
529       n1          = 2 * local_n0 * (dim[1] / 2 + 1);
530       if (fin) {
531         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
532         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
533         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
534         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
535         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
536       }
537       if (fout) {
538         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
539         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
540         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
541         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
542         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
543       }
544       if (bout) {
545         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
546         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
547         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
548         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
549         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
550       }
551 #else
552       /* Get local size */
553       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
554       if (fin) {
555         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
556         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
557         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
558         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
559         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
560       }
561       if (fout) {
562         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
563         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
564         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
565         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
566         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
567       }
568       if (bout) {
569         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
570         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
571         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
572         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
573         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
574       }
575 #endif
576       break;
577     case 3:
578 #if !defined(PETSC_USE_COMPLEX)
579       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
580       N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
581       n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
582       if (fin) {
583         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
584         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
585         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
586         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
587         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
588       }
589       if (fout) {
590         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
591         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
592         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
593         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
594         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
595       }
596       if (bout) {
597         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
598         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
599         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
600         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
601         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
602       }
603 #else
604       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
605       if (fin) {
606         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
607         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
608         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
609         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
610         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
611       }
612       if (fout) {
613         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
614         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
615         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
616         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
617         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
618       }
619       if (bout) {
620         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
621         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
622         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
623         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
624         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
625       }
626 #endif
627       break;
628     default:
629 #if !defined(PETSC_USE_COMPLEX)
630       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
631       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
632       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
633       N1                                    = 2 * fft->N * (PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
634       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
635 
636       if (fin) {
637         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
638         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
639         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
640         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
641         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
642       }
643       if (fout) {
644         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
645         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
646         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
647         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
648         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
649       }
650       if (bout) {
651         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
652         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
653         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
654         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
655         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
656       }
657 #else
658       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
659       if (fin) {
660         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
661         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
662         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
663         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
664         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
665       }
666       if (fout) {
667         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
668         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
669         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
670         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
671         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
672       }
673       if (bout) {
674         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
675         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
676         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
677         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
678         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
679       }
680 #endif
681       break;
682     }
    /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
       v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
       memory leaks. We void these pointers here to avoid accidental uses.
    */
687     if (fin) (*fin)->ops->replacearray = NULL;
688     if (fout) (*fout)->ops->replacearray = NULL;
689     if (bout) (*bout)->ops->replacearray = NULL;
690 #endif
691   }
692   PetscFunctionReturn(0);
693 }
694 
695 /*@
696    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
697 
698    Collective on Mat
699 
700    Input Parameters:
701 +  A - FFTW matrix
702 -  x - the PETSc vector
703 
704    Output Parameters:
705 .  y - the FFTW vector
706 
707   Options Database Keys:
708 . -mat_fftw_plannerflags - set FFTW planner flags
709 
710    Level: intermediate
711 
   Note: For a real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
         one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
         zeros. This routine does that job by a scattering operation.
715 
716 .seealso: `VecScatterFFTWToPetsc()`
717 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y) {
  PetscFunctionBegin;
  /* Forward to the type-specific implementation composed on A under the name
     "VecScatterPetscToFFTW_C" (registered in MatCreate_FFTW); PetscUseMethod
     raises an error if A does not provide that method */
  PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(0);
}
723 
724 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y) {
725   MPI_Comm    comm;
726   Mat_FFT    *fft = (Mat_FFT *)A->data;
727   PetscInt    low;
728   PetscMPIInt rank, size;
729   PetscInt    vsize, vsize1;
730   VecScatter  vecscat;
731   IS          list1;
732 
733   PetscFunctionBegin;
734   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
735   PetscCallMPI(MPI_Comm_size(comm, &size));
736   PetscCallMPI(MPI_Comm_rank(comm, &rank));
737   PetscCall(VecGetOwnershipRange(y, &low, NULL));
738 
739   if (size == 1) {
740     PetscCall(VecGetSize(x, &vsize));
741     PetscCall(VecGetSize(y, &vsize1));
742     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
743     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
744     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
745     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
746     PetscCall(VecScatterDestroy(&vecscat));
747     PetscCall(ISDestroy(&list1));
748 #if !PetscDefined(HAVE_MPIUNI)
749   } else {
750     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
751     PetscInt  ndim = fft->ndim, *dim = fft->dim;
752     ptrdiff_t local_n0, local_0_start;
753     ptrdiff_t local_n1, local_1_start;
754     IS        list2;
755 #if !defined(PETSC_USE_COMPLEX)
756     PetscInt  i, j, k, partial_dim;
757     PetscInt *indx1, *indx2, tempindx, tempindx1;
758     PetscInt  NM;
759     ptrdiff_t temp;
760 #endif
761 
762     switch (ndim) {
763     case 1:
764 #if defined(PETSC_USE_COMPLEX)
765       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
766 
767       PetscCall(ISCreateStride(comm, local_n0, local_0_start, 1, &list1));
768       PetscCall(ISCreateStride(comm, local_n0, low, 1, &list2));
769       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
770       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
771       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
772       PetscCall(VecScatterDestroy(&vecscat));
773       PetscCall(ISDestroy(&list1));
774       PetscCall(ISDestroy(&list2));
775 #else
776       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
777 #endif
778       break;
779     case 2:
780 #if defined(PETSC_USE_COMPLEX)
781       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
782 
783       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
784       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
785       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
786       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
787       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
788       PetscCall(VecScatterDestroy(&vecscat));
789       PetscCall(ISDestroy(&list1));
790       PetscCall(ISDestroy(&list2));
791 #else
792       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
793 
794       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
795       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
796 
797       if (dim[1] % 2 == 0) {
798         NM = dim[1] + 2;
799       } else {
800         NM = dim[1] + 1;
801       }
802       for (i = 0; i < local_n0; i++) {
803         for (j = 0; j < dim[1]; j++) {
804           tempindx  = i * dim[1] + j;
805           tempindx1 = i * NM + j;
806 
807           indx1[tempindx] = local_0_start * dim[1] + tempindx;
808           indx2[tempindx] = low + tempindx1;
809         }
810       }
811 
812       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
813       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
814 
815       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
816       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
817       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
818       PetscCall(VecScatterDestroy(&vecscat));
819       PetscCall(ISDestroy(&list1));
820       PetscCall(ISDestroy(&list2));
821       PetscCall(PetscFree(indx1));
822       PetscCall(PetscFree(indx2));
823 #endif
824       break;
825 
826     case 3:
827 #if defined(PETSC_USE_COMPLEX)
828       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
829 
830       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
831       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
832       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
833       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
834       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
835       PetscCall(VecScatterDestroy(&vecscat));
836       PetscCall(ISDestroy(&list1));
837       PetscCall(ISDestroy(&list2));
838 #else
839       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
840       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
841       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
842 
843       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
844       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
845 
846       if (dim[2] % 2 == 0) NM = dim[2] + 2;
847       else NM = dim[2] + 1;
848 
849       for (i = 0; i < local_n0; i++) {
850         for (j = 0; j < dim[1]; j++) {
851           for (k = 0; k < dim[2]; k++) {
852             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
853             tempindx1 = i * dim[1] * NM + j * NM + k;
854 
855             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
856             indx2[tempindx] = low + tempindx1;
857           }
858         }
859       }
860 
861       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
862       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
863       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
864       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
865       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
866       PetscCall(VecScatterDestroy(&vecscat));
867       PetscCall(ISDestroy(&list1));
868       PetscCall(ISDestroy(&list2));
869       PetscCall(PetscFree(indx1));
870       PetscCall(PetscFree(indx2));
871 #endif
872       break;
873 
874     default:
875 #if defined(PETSC_USE_COMPLEX)
876       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
877 
878       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
879       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
880       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
881       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
882       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
883       PetscCall(VecScatterDestroy(&vecscat));
884       PetscCall(ISDestroy(&list1));
885       PetscCall(ISDestroy(&list2));
886 #else
887       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
888       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
889       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
890 
891       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
892 
893       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
894 
895       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
896 
897       partial_dim = fftw->partial_dim;
898 
899       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
900       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
901 
902       if (dim[ndim - 1] % 2 == 0) NM = 2;
903       else NM = 1;
904 
905       j = low;
906       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
907         indx1[i] = local_0_start * partial_dim + i;
908         indx2[i] = j;
909         if (k % dim[ndim - 1] == 0) j += NM;
910         j++;
911       }
912       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
913       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
914       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
915       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
916       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
917       PetscCall(VecScatterDestroy(&vecscat));
918       PetscCall(ISDestroy(&list1));
919       PetscCall(ISDestroy(&list2));
920       PetscCall(PetscFree(indx1));
921       PetscCall(PetscFree(indx2));
922 #endif
923       break;
924     }
925 #endif
926   }
927   PetscFunctionReturn(0);
928 }
929 
930 /*@
931    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
932 
933    Collective on Mat
934 
935     Input Parameters:
936 +   A - FFTW matrix
937 -   x - FFTW vector
938 
939    Output Parameters:
940 .  y - PETSc vector
941 
942    Level: intermediate
943 
   Note: When doing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
         VecScatterFFTWToPetsc removes those extra zeros.
946 
947 .seealso: `VecScatterPetscToFFTW()`
948 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y) {
  PetscFunctionBegin;
  /* Forward to the type-specific implementation composed on A under the name
     "VecScatterFFTWToPetsc_C" (registered in MatCreate_FFTW); PetscUseMethod
     raises an error if A does not provide that method */
  PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(0);
}
954 
955 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y) {
956   MPI_Comm    comm;
957   Mat_FFT    *fft = (Mat_FFT *)A->data;
958   PetscInt    low;
959   PetscMPIInt rank, size;
960   VecScatter  vecscat;
961   IS          list1;
962 
963   PetscFunctionBegin;
964   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
965   PetscCallMPI(MPI_Comm_size(comm, &size));
966   PetscCallMPI(MPI_Comm_rank(comm, &rank));
967   PetscCall(VecGetOwnershipRange(x, &low, NULL));
968 
969   if (size == 1) {
970     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
971     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
972     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
973     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
974     PetscCall(VecScatterDestroy(&vecscat));
975     PetscCall(ISDestroy(&list1));
976 
977 #if !PetscDefined(HAVE_MPIUNI)
978   } else {
979     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
980     PetscInt  ndim = fft->ndim, *dim = fft->dim;
981     ptrdiff_t local_n0, local_0_start;
982     ptrdiff_t local_n1, local_1_start;
983     IS        list2;
984 #if !defined(PETSC_USE_COMPLEX)
985     PetscInt  i, j, k, partial_dim;
986     PetscInt *indx1, *indx2, tempindx, tempindx1;
987     PetscInt  NM;
988     ptrdiff_t temp;
989 #endif
990     switch (ndim) {
991     case 1:
992 #if defined(PETSC_USE_COMPLEX)
993       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
994 
995       PetscCall(ISCreateStride(comm, local_n1, local_1_start, 1, &list1));
996       PetscCall(ISCreateStride(comm, local_n1, low, 1, &list2));
997       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
998       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
999       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1000       PetscCall(VecScatterDestroy(&vecscat));
1001       PetscCall(ISDestroy(&list1));
1002       PetscCall(ISDestroy(&list2));
1003 #else
1004       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
1005 #endif
1006       break;
1007     case 2:
1008 #if defined(PETSC_USE_COMPLEX)
1009       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1010 
1011       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
1012       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
1013       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1014       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1015       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1016       PetscCall(VecScatterDestroy(&vecscat));
1017       PetscCall(ISDestroy(&list1));
1018       PetscCall(ISDestroy(&list2));
1019 #else
1020       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1021 
1022       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
1023       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
1024 
1025       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1026       else NM = dim[1] + 1;
1027 
1028       for (i = 0; i < local_n0; i++) {
1029         for (j = 0; j < dim[1]; j++) {
1030           tempindx  = i * dim[1] + j;
1031           tempindx1 = i * NM + j;
1032 
1033           indx1[tempindx] = local_0_start * dim[1] + tempindx;
1034           indx2[tempindx] = low + tempindx1;
1035         }
1036       }
1037 
1038       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
1039       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
1040 
1041       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1042       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1043       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1044       PetscCall(VecScatterDestroy(&vecscat));
1045       PetscCall(ISDestroy(&list1));
1046       PetscCall(ISDestroy(&list2));
1047       PetscCall(PetscFree(indx1));
1048       PetscCall(PetscFree(indx2));
1049 #endif
1050       break;
1051     case 3:
1052 #if defined(PETSC_USE_COMPLEX)
1053       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1054 
1055       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
1056       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
1057       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1058       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1059       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1060       PetscCall(VecScatterDestroy(&vecscat));
1061       PetscCall(ISDestroy(&list1));
1062       PetscCall(ISDestroy(&list2));
1063 #else
1064       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1065 
1066       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
1067       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
1068 
1069       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1070       else NM = dim[2] + 1;
1071 
1072       for (i = 0; i < local_n0; i++) {
1073         for (j = 0; j < dim[1]; j++) {
1074           for (k = 0; k < dim[2]; k++) {
1075             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1076             tempindx1 = i * dim[1] * NM + j * NM + k;
1077 
1078             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
1079             indx2[tempindx] = low + tempindx1;
1080           }
1081         }
1082       }
1083 
1084       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
1085       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
1086 
1087       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1088       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1089       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1090       PetscCall(VecScatterDestroy(&vecscat));
1091       PetscCall(ISDestroy(&list1));
1092       PetscCall(ISDestroy(&list2));
1093       PetscCall(PetscFree(indx1));
1094       PetscCall(PetscFree(indx2));
1095 #endif
1096       break;
1097     default:
1098 #if defined(PETSC_USE_COMPLEX)
1099       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
1100 
1101       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
1102       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
1103       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1104       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1105       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1106       PetscCall(VecScatterDestroy(&vecscat));
1107       PetscCall(ISDestroy(&list1));
1108       PetscCall(ISDestroy(&list2));
1109 #else
1110       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
1111 
1112       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
1113 
1114       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1115 
1116       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
1117 
1118       partial_dim = fftw->partial_dim;
1119 
1120       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
1121       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
1122 
1123       if (dim[ndim - 1] % 2 == 0) NM = 2;
1124       else NM = 1;
1125 
1126       j = low;
1127       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
1128         indx1[i] = local_0_start * partial_dim + i;
1129         indx2[i] = j;
1130         if (k % dim[ndim - 1] == 0) j += NM;
1131         j++;
1132       }
1133       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
1134       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
1135 
1136       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1137       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1138       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1139       PetscCall(VecScatterDestroy(&vecscat));
1140       PetscCall(ISDestroy(&list1));
1141       PetscCall(ISDestroy(&list2));
1142       PetscCall(PetscFree(indx1));
1143       PetscCall(PetscFree(indx2));
1144 #endif
1145       break;
1146     }
1147 #endif
1148   }
1149   PetscFunctionReturn(0);
1150 }
1151 
1152 /*
1153     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1154 
1155   Options Database Keys:
. -mat_fftw_plannerflags - set FFTW planner flags
1157 
1158    Level: intermediate
1159 
1160 */
1161 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A) {
1162   MPI_Comm    comm;
1163   Mat_FFT    *fft = (Mat_FFT *)A->data;
1164   Mat_FFTW   *fftw;
1165   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1166   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1167   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1168   PetscBool   flg;
1169   PetscInt    p_flag, partial_dim = 1, ctr;
1170   PetscMPIInt size, rank;
1171   ptrdiff_t  *pdim;
1172 #if !defined(PETSC_USE_COMPLEX)
1173   PetscInt tot_dim;
1174 #endif
1175 
1176   PetscFunctionBegin;
1177   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1178   PetscCallMPI(MPI_Comm_size(comm, &size));
1179   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1180 
1181 #if !PetscDefined(HAVE_MPIUNI)
1182   fftw_mpi_init();
1183 #endif
1184   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1185   pdim[0] = dim[0];
1186 #if !defined(PETSC_USE_COMPLEX)
1187   tot_dim = 2 * dim[0];
1188 #endif
1189   for (ctr = 1; ctr < ndim; ctr++) {
1190     partial_dim *= dim[ctr];
1191     pdim[ctr] = dim[ctr];
1192 #if !defined(PETSC_USE_COMPLEX)
1193     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1194     else tot_dim *= dim[ctr];
1195 #endif
1196   }
1197 
1198   if (size == 1) {
1199 #if defined(PETSC_USE_COMPLEX)
1200     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
1201     fft->n = fft->N;
1202 #else
1203     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
1204     fft->n       = tot_dim;
1205 #endif
1206 #if !PetscDefined(HAVE_MPIUNI)
1207   } else {
1208     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1209 #if !defined(PETSC_USE_COMPLEX)
1210     ptrdiff_t temp;
1211     PetscInt  N1;
1212 #endif
1213 
1214     switch (ndim) {
1215     case 1:
1216 #if !defined(PETSC_USE_COMPLEX)
1217       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1218 #else
1219       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1220       fft->n = (PetscInt)local_n0;
1221       PetscCall(MatSetSizes(A, local_n1, fft->n, fft->N, fft->N));
1222 #endif
1223       break;
1224     case 2:
1225 #if defined(PETSC_USE_COMPLEX)
1226       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1227       fft->n = (PetscInt)local_n0 * dim[1];
1228       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1229 #else
1230       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1231 
1232       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1233       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
1234 #endif
1235       break;
1236     case 3:
1237 #if defined(PETSC_USE_COMPLEX)
1238       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1239 
1240       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1241       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1242 #else
1243       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1244 
1245       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1246       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
1247 #endif
1248       break;
1249     default:
1250 #if defined(PETSC_USE_COMPLEX)
1251       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);
1252 
1253       fft->n = (PetscInt)local_n0 * partial_dim;
1254       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1255 #else
1256       temp = pdim[ndim - 1];
1257 
1258       pdim[ndim - 1] = temp / 2 + 1;
1259 
1260       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1261 
1262       fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
1263       N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);
1264 
1265       pdim[ndim - 1] = temp;
1266 
1267       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1268 #endif
1269       break;
1270     }
1271 #endif
1272   }
1273   free(pdim);
1274   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
1275   PetscCall(PetscNewLog(A, &fftw));
1276   fft->data = (void *)fftw;
1277 
1278   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1279   fftw->partial_dim = partial_dim;
1280 
1281   PetscCall(PetscMalloc1(ndim, &(fftw->dim_fftw)));
1282   if (size == 1) {
1283 #if defined(PETSC_USE_64BIT_INDICES)
1284     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1285 #else
1286     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1287 #endif
1288   }
1289 
1290   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];
1291 
1292   fftw->p_forward  = NULL;
1293   fftw->p_backward = NULL;
1294   fftw->p_flag     = FFTW_ESTIMATE;
1295 
1296   if (size == 1) {
1297     A->ops->mult          = MatMult_SeqFFTW;
1298     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1299 #if !PetscDefined(HAVE_MPIUNI)
1300   } else {
1301     A->ops->mult          = MatMult_MPIFFTW;
1302     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1303 #endif
1304   }
1305   fft->matdestroy = MatDestroy_FFTW;
1306   A->assembled    = PETSC_TRUE;
1307   A->preallocated = PETSC_TRUE;
1308 
1309   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
1310   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
1311   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));
1312 
1313   /* get runtime options */
1314   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1315   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
1316   if (flg) fftw->p_flag = iplans[p_flag];
1317   PetscOptionsEnd();
1318   PetscFunctionReturn(0);
1319 }
1320