xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 2d30e087755efd99e28fdfe792ffbeb2ee1ea928)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #if !PetscDefined(HAVE_MPIUNI)
10 #include <fftw3-mpi.h>
11 #else
12 #include <fftw3.h>
13 #endif
14 EXTERN_C_END
15 
typedef struct {
  ptrdiff_t ndim_fftw, *dim_fftw; /* rank and dimensions of the transform in FFTW's ptrdiff_t convention */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims; /* guru-interface dimension descriptors (64-bit index build) */
#else
  fftw_iodim *iodims; /* guru-interface dimension descriptors */
#endif
  PetscInt     partial_dim; /* NOTE(review): presumably the product of all but the first dimension — confirm with the MATFFTW setup code */
  fftw_plan    p_forward, p_backward;                       /* plans are created lazily on first MatMult/MatMultTranspose and cached here */
  unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays because fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
29 
30 extern PetscErrorCode MatMult_SeqFFTW(Mat, Vec, Vec);
31 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat, Vec, Vec);
32 extern PetscErrorCode MatDestroy_FFTW(Mat);
33 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
34 #if !PetscDefined(HAVE_MPIUNI)
35 extern PetscErrorCode MatMult_MPIFFTW(Mat, Vec, Vec);
36 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat, Vec, Vec);
37 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
38 #endif
39 
/*
   MatMult_SeqFFTW performs the sequential forward DFT

   Input parameters:
     A - the matrix
     x - the vector on which the forward DFT will be performed

   Output parameter:
     y - vector that stores the result of the forward DFT

   Note:
   The FFTW plan is created lazily on the first call and cached in the matrix.
   Subsequent calls reuse it, either with fftw_execute() (same arrays as at plan
   creation) or with the new-array execute interface (different arrays).
*/
PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y) {
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
#else
  fftw_iodim *iodims;
#endif
  PetscInt i;
#endif
  PetscInt ndim = fft->ndim, *dim = fft->dim;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    default: /* rank > 3: use the FFTW guru interface with explicit row-major strides */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        /* last dimension is contiguous; each preceding stride is the product of the faster-varying extents */
        iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      if (ndim) {
        iodims[ndim - 1].n  = (int)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1;
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    }
    /* remember with which arrays the plan was made, so later calls know whether the plan is directly reusable */
    fftw->finarray  = (PetscScalar *)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten during planning!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else {                                                         /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(0);
}
140 
/* MatMultTranspose_SeqFFTW performs the sequential backward DFT

   Input parameters:
     A - the matrix
     x - the vector on which the backward DFT will be performed

   Output parameter:
     y - vector that stores the result of the backward DFT

   Note:
   As in MatMult_SeqFFTW(), the plan is created lazily and cached; the iodims
   descriptors for the rank > 3 guru case are assumed to have been filled in by a
   prior forward transform (they are read from fftw->iodims, not rebuilt here).
*/

PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y) {
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims = fftw->iodims;
#else
  fftw_iodim *iodims = fftw->iodims;
#endif
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    default: /* rank > 3: guru interface, reusing the iodims built for the forward transform */
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
#endif
#else
      fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
#endif
      break;
    }
    /* remember with which arrays the plan was made (see MatMult_SeqFFTW) */
    fftw->binarray  = (PetscScalar *)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else {                                                         /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(0);
}
220 
221 #if !PetscDefined(HAVE_MPIUNI)
222 /* MatMult_MPIFFTW performs forward DFT in parallel
223    Input parameter:
224      A - the matrix
225      x - the vector on which FDFT will be performed
226 
227    Output parameter:
228    y   - vector that stores result of FDFT
229 */
230 PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y) {
231   Mat_FFT           *fft  = (Mat_FFT *)A->data;
232   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
233   const PetscScalar *x_array;
234   PetscScalar       *y_array;
235   PetscInt           ndim = fft->ndim, *dim = fft->dim;
236   MPI_Comm           comm;
237 
238   PetscFunctionBegin;
239   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
240   PetscCall(VecGetArrayRead(x, &x_array));
241   PetscCall(VecGetArray(y, &y_array));
242   if (!fftw->p_forward) { /* create a plan, then execute it */
243     switch (ndim) {
244     case 1:
245 #if defined(PETSC_USE_COMPLEX)
246       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
247 #else
248       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
249 #endif
250       break;
251     case 2:
252 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
253       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
254 #else
255       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
256 #endif
257       break;
258     case 3:
259 #if defined(PETSC_USE_COMPLEX)
260       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
261 #else
262       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
263 #endif
264       break;
265     default:
266 #if defined(PETSC_USE_COMPLEX)
267       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
268 #else
269       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
270 #endif
271       break;
272     }
273     fftw->finarray  = (PetscScalar *)x_array;
274     fftw->foutarray = y_array;
275     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
276                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
277     fftw_execute(fftw->p_forward);
278   } else {                                                         /* use existing plan */
279     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
280       fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
281     } else {
282       fftw_execute(fftw->p_forward);
283     }
284   }
285   PetscCall(VecRestoreArray(y, &y_array));
286   PetscCall(VecRestoreArrayRead(x, &x_array));
287   PetscFunctionReturn(0);
288 }
289 
290 /*
291    MatMultTranspose_MPIFFTW performs parallel backward DFT
292    Input parameter:
293      A - the matrix
294      x - the vector on which BDFT will be performed
295 
296    Output parameter:
297      y - vector that stores result of BDFT
298 */
299 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y) {
300   Mat_FFT           *fft  = (Mat_FFT *)A->data;
301   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
302   const PetscScalar *x_array;
303   PetscScalar       *y_array;
304   PetscInt           ndim = fft->ndim, *dim = fft->dim;
305   MPI_Comm           comm;
306 
307   PetscFunctionBegin;
308   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
309   PetscCall(VecGetArrayRead(x, &x_array));
310   PetscCall(VecGetArray(y, &y_array));
311   if (!fftw->p_backward) { /* create a plan, then execute it */
312     switch (ndim) {
313     case 1:
314 #if defined(PETSC_USE_COMPLEX)
315       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
316 #else
317       SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
318 #endif
319       break;
320     case 2:
321 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
322       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
323 #else
324       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
325 #endif
326       break;
327     case 3:
328 #if defined(PETSC_USE_COMPLEX)
329       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
330 #else
331       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
332 #endif
333       break;
334     default:
335 #if defined(PETSC_USE_COMPLEX)
336       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
337 #else
338       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
339 #endif
340       break;
341     }
342     fftw->binarray  = (PetscScalar *)x_array;
343     fftw->boutarray = y_array;
344     fftw_execute(fftw->p_backward);
345   } else {                                                         /* use existing plan */
346     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
347       fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
348     } else {
349       fftw_execute(fftw->p_backward);
350     }
351   }
352   PetscCall(VecRestoreArray(y, &y_array));
353   PetscCall(VecRestoreArrayRead(x, &x_array));
354   PetscFunctionReturn(0);
355 }
356 #endif
357 
358 PetscErrorCode MatDestroy_FFTW(Mat A) {
359   Mat_FFT  *fft  = (Mat_FFT *)A->data;
360   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
361 
362   PetscFunctionBegin;
363   fftw_destroy_plan(fftw->p_forward);
364   fftw_destroy_plan(fftw->p_backward);
365   if (fftw->iodims) free(fftw->iodims);
366   PetscCall(PetscFree(fftw->dim_fftw));
367   PetscCall(PetscFree(fft->data));
368   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
369   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
370   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
371 #if !PetscDefined(HAVE_MPIUNI)
372   fftw_mpi_cleanup();
373 #endif
374   PetscFunctionReturn(0);
375 }
376 
377 #if !PetscDefined(HAVE_MPIUNI)
378 #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
/* VecDestroy_MPIFFTW - destroys a vector whose data array was obtained from
   fftw_malloc() (see MatCreateVecsFFTW_FFTW): the array must be released with
   fftw_free(), after which the regular MPI vector destroy runs (the vector's
   array_allocated is NULL, so VecDestroy_MPI will not try to free it again). */
PetscErrorCode VecDestroy_MPIFFTW(Vec v) {
  PetscScalar *array;

  PetscFunctionBegin;
  PetscCall(VecGetArray(v, &array));
  fftw_free((fftw_complex *)array); /* pairs with fftw_malloc() in MatCreateVecsFFTW_FFTW() */
  PetscCall(VecRestoreArray(v, &array));
  PetscCall(VecDestroy_MPI(v));
  PetscFunctionReturn(0);
}
389 #endif
390 
391 #if !PetscDefined(HAVE_MPIUNI)
392 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new) {
393   Mat A;
394 
395   PetscFunctionBegin;
396   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
397   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
398   PetscFunctionReturn(0);
399 }
400 
401 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new) {
402   Mat A;
403 
404   PetscFunctionBegin;
405   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
406   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
407   PetscFunctionReturn(0);
408 }
409 
410 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new) {
411   Mat A;
412 
413   PetscFunctionBegin;
414   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
415   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
416   PetscFunctionReturn(0);
417 }
418 #endif
419 
/*@
   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
     parallel layout determined by `MATFFTW`

   Collective on Mat

   Input Parameter:
.   A - the matrix

   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW

  Level: advanced

  Notes:
  The parallel layout of the output of the forward FFTW is always the same as the input
  of the backward FFTW. But the parallel layout of the input vector of the forward
  FFTW might not be the same as the output of the backward FFTW.

  We need to provide enough space while doing a parallel real transform.
  We need to pad extra zeros at the end of the last dimension. For this reason one needs to
  invoke the fftw_mpi_local_size_transposed routines. Remember one has to change the
  last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
  depends on whether the last dimension is even or odd. If the last dimension is even one needs to pad two
  zeros; if it is odd only one zero is needed.

  Lastly one needs some scratch space at the end of the data set in each process. alloc_local
  figures out how much space is needed, i.e. it figures out the data+scratch space for
  each processor and returns that.

  Developer Note:
  How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?

.seealso: `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
@*/
PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z) {
  PetscFunctionBegin;
  /* dispatch to MatCreateVecsFFTW_FFTW via the function composed on the matrix */
  PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
  PetscFunctionReturn(0);
}
462 
463 PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout) {
464   PetscMPIInt size, rank;
465   MPI_Comm    comm;
466   Mat_FFT    *fft = (Mat_FFT *)A->data;
467 
468   PetscFunctionBegin;
469   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
470   PetscValidType(A, 1);
471   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
472 
473   PetscCallMPI(MPI_Comm_size(comm, &size));
474   PetscCallMPI(MPI_Comm_rank(comm, &rank));
475   if (size == 1) { /* sequential case */
476 #if defined(PETSC_USE_COMPLEX)
477     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
478     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
479     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
480 #else
481     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
482     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
483     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
484 #endif
485 #if !PetscDefined(HAVE_MPIUNI)
486   } else { /* parallel cases */
487     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
488     PetscInt      ndim = fft->ndim, *dim = fft->dim;
489     ptrdiff_t     alloc_local, local_n0, local_0_start;
490     ptrdiff_t     local_n1;
491     fftw_complex *data_fout;
492     ptrdiff_t     local_1_start;
493 #if defined(PETSC_USE_COMPLEX)
494     fftw_complex *data_fin, *data_bout;
495 #else
496     double   *data_finr, *data_boutr;
497     PetscInt  n1, N1;
498     ptrdiff_t temp;
499 #endif
500 
501     switch (ndim) {
502     case 1:
503 #if !defined(PETSC_USE_COMPLEX)
504       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
505 #else
506       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
507       if (fin) {
508         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
509         PetscCall(VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin));
510         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
511         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
512         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
513       }
514       if (fout) {
515         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
516         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout));
517         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
518         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
519         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
520       }
521       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
522       if (bout) {
523         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
524         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout));
525         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
526         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
527         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
528       }
529       break;
530 #endif
531     case 2:
532 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
533       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
534       N1          = 2 * dim[0] * (dim[1] / 2 + 1);
535       n1          = 2 * local_n0 * (dim[1] / 2 + 1);
536       if (fin) {
537         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
538         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
539         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
540         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
541         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
542       }
543       if (fout) {
544         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
545         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
546         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
547         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
548         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
549       }
550       if (bout) {
551         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
552         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
553         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
554         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
555         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
556       }
557 #else
558       /* Get local size */
559       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
560       if (fin) {
561         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
562         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
563         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
564         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
565         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
566       }
567       if (fout) {
568         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
569         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
570         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
571         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
572         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
573       }
574       if (bout) {
575         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
576         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
577         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
578         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
579         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
580       }
581 #endif
582       break;
583     case 3:
584 #if !defined(PETSC_USE_COMPLEX)
585       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
586       N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
587       n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
588       if (fin) {
589         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
590         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
591         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
592         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
593         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
594       }
595       if (fout) {
596         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
597         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
598         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
599         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
600         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
601       }
602       if (bout) {
603         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
604         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
605         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
606         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
607         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
608       }
609 #else
610       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
611       if (fin) {
612         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
613         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
614         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
615         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
616         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
617       }
618       if (fout) {
619         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
620         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
621         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
622         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
623         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
624       }
625       if (bout) {
626         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
627         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
628         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
629         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
630         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
631       }
632 #endif
633       break;
634     default:
635 #if !defined(PETSC_USE_COMPLEX)
636       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
637       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
638       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
639       N1                                    = 2 * fft->N * (PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
640       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
641 
642       if (fin) {
643         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
644         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
645         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
646         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
647         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
648       }
649       if (fout) {
650         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
651         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
652         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
653         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
654         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
655       }
656       if (bout) {
657         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
658         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
659         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
660         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
661         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
662       }
663 #else
664       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
665       if (fin) {
666         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
667         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
668         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
669         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
670         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
671       }
672       if (fout) {
673         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
674         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
675         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
676         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
677         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
678       }
679       if (bout) {
680         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
681         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
682         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
683         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
684         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
685       }
686 #endif
687       break;
688     }
689     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
690        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
       memory leaks. We void these pointers here to avoid accidental use.
692     */
693     if (fin) (*fin)->ops->replacearray = NULL;
694     if (fout) (*fout)->ops->replacearray = NULL;
695     if (bout) (*bout)->ops->replacearray = NULL;
696 #endif
697   }
698   PetscFunctionReturn(0);
699 }
700 
701 /*@
702    VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.
703 
704    Collective on Mat
705 
706    Input Parameters:
707 +  A - FFTW matrix
708 -  x - the PETSc vector
709 
710    Output Parameters:
711 .  y - the FFTW vector
712 
713    Level: intermediate
714 
715    Note:
   For a real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
   one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
   zeros. This routine does that job by a scattering operation.
719 
720 .seealso: `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
721 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y) {
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation composed under this key by
     MatCreate_FFTW() (VecScatterPetscToFFTW_FFTW); PetscUseMethod errors if
     the matrix type does not provide it. */
  PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(0);
}
727 
728 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y) {
729   MPI_Comm    comm;
730   Mat_FFT    *fft = (Mat_FFT *)A->data;
731   PetscInt    low;
732   PetscMPIInt rank, size;
733   PetscInt    vsize, vsize1;
734   VecScatter  vecscat;
735   IS          list1;
736 
737   PetscFunctionBegin;
738   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
739   PetscCallMPI(MPI_Comm_size(comm, &size));
740   PetscCallMPI(MPI_Comm_rank(comm, &rank));
741   PetscCall(VecGetOwnershipRange(y, &low, NULL));
742 
743   if (size == 1) {
744     PetscCall(VecGetSize(x, &vsize));
745     PetscCall(VecGetSize(y, &vsize1));
746     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
747     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
748     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
749     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
750     PetscCall(VecScatterDestroy(&vecscat));
751     PetscCall(ISDestroy(&list1));
752 #if !PetscDefined(HAVE_MPIUNI)
753   } else {
754     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
755     PetscInt  ndim = fft->ndim, *dim = fft->dim;
756     ptrdiff_t local_n0, local_0_start;
757     ptrdiff_t local_n1, local_1_start;
758     IS        list2;
759 #if !defined(PETSC_USE_COMPLEX)
760     PetscInt  i, j, k, partial_dim;
761     PetscInt *indx1, *indx2, tempindx, tempindx1;
762     PetscInt  NM;
763     ptrdiff_t temp;
764 #endif
765 
766     switch (ndim) {
767     case 1:
768 #if defined(PETSC_USE_COMPLEX)
769       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
770 
771       PetscCall(ISCreateStride(comm, local_n0, local_0_start, 1, &list1));
772       PetscCall(ISCreateStride(comm, local_n0, low, 1, &list2));
773       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
774       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
775       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
776       PetscCall(VecScatterDestroy(&vecscat));
777       PetscCall(ISDestroy(&list1));
778       PetscCall(ISDestroy(&list2));
779 #else
780       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
781 #endif
782       break;
783     case 2:
784 #if defined(PETSC_USE_COMPLEX)
785       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
786 
787       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
788       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
789       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
790       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
791       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
792       PetscCall(VecScatterDestroy(&vecscat));
793       PetscCall(ISDestroy(&list1));
794       PetscCall(ISDestroy(&list2));
795 #else
796       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
797 
798       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
799       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
800 
801       if (dim[1] % 2 == 0) {
802         NM = dim[1] + 2;
803       } else {
804         NM = dim[1] + 1;
805       }
806       for (i = 0; i < local_n0; i++) {
807         for (j = 0; j < dim[1]; j++) {
808           tempindx  = i * dim[1] + j;
809           tempindx1 = i * NM + j;
810 
811           indx1[tempindx] = local_0_start * dim[1] + tempindx;
812           indx2[tempindx] = low + tempindx1;
813         }
814       }
815 
816       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
817       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
818 
819       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
820       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
821       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
822       PetscCall(VecScatterDestroy(&vecscat));
823       PetscCall(ISDestroy(&list1));
824       PetscCall(ISDestroy(&list2));
825       PetscCall(PetscFree(indx1));
826       PetscCall(PetscFree(indx2));
827 #endif
828       break;
829 
830     case 3:
831 #if defined(PETSC_USE_COMPLEX)
832       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
833 
834       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
835       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
836       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
837       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
838       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
839       PetscCall(VecScatterDestroy(&vecscat));
840       PetscCall(ISDestroy(&list1));
841       PetscCall(ISDestroy(&list2));
842 #else
843       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
844       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
845       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
846 
847       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
848       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
849 
850       if (dim[2] % 2 == 0) NM = dim[2] + 2;
851       else NM = dim[2] + 1;
852 
853       for (i = 0; i < local_n0; i++) {
854         for (j = 0; j < dim[1]; j++) {
855           for (k = 0; k < dim[2]; k++) {
856             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
857             tempindx1 = i * dim[1] * NM + j * NM + k;
858 
859             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
860             indx2[tempindx] = low + tempindx1;
861           }
862         }
863       }
864 
865       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
866       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
867       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
868       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
869       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
870       PetscCall(VecScatterDestroy(&vecscat));
871       PetscCall(ISDestroy(&list1));
872       PetscCall(ISDestroy(&list2));
873       PetscCall(PetscFree(indx1));
874       PetscCall(PetscFree(indx2));
875 #endif
876       break;
877 
878     default:
879 #if defined(PETSC_USE_COMPLEX)
880       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
881 
882       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
883       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
884       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
885       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
886       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
887       PetscCall(VecScatterDestroy(&vecscat));
888       PetscCall(ISDestroy(&list1));
889       PetscCall(ISDestroy(&list2));
890 #else
891       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
892       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
893       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
894 
895       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
896 
897       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
898 
899       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
900 
901       partial_dim = fftw->partial_dim;
902 
903       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
904       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
905 
906       if (dim[ndim - 1] % 2 == 0) NM = 2;
907       else NM = 1;
908 
909       j = low;
910       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
911         indx1[i] = local_0_start * partial_dim + i;
912         indx2[i] = j;
913         if (k % dim[ndim - 1] == 0) j += NM;
914         j++;
915       }
916       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
917       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
918       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
919       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
920       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
921       PetscCall(VecScatterDestroy(&vecscat));
922       PetscCall(ISDestroy(&list1));
923       PetscCall(ISDestroy(&list2));
924       PetscCall(PetscFree(indx1));
925       PetscCall(PetscFree(indx2));
926 #endif
927       break;
928     }
929 #endif
930   }
931   PetscFunctionReturn(0);
932 }
933 
934 /*@
935    VecScatterFFTWToPetsc - Converts `MATFFTW` output vector to a PETSc vector.
936 
937    Collective on Mat
938 
939     Input Parameters:
940 +   A - `MATFFTW` matrix
941 -   x - FFTW vector
942 
943    Output Parameters:
944 .  y - PETSc vector
945 
946    Level: intermediate
947 
948    Note:
949    While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
950    `VecScatterFFTWToPetsc()` removes those extra zeros.
951 
952 .seealso: `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
953 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y) {
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation composed under this key by
     MatCreate_FFTW() (VecScatterFFTWToPetsc_FFTW); PetscUseMethod errors if
     the matrix type does not provide it. */
  PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(0);
}
959 
960 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y) {
961   MPI_Comm    comm;
962   Mat_FFT    *fft = (Mat_FFT *)A->data;
963   PetscInt    low;
964   PetscMPIInt rank, size;
965   VecScatter  vecscat;
966   IS          list1;
967 
968   PetscFunctionBegin;
969   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
970   PetscCallMPI(MPI_Comm_size(comm, &size));
971   PetscCallMPI(MPI_Comm_rank(comm, &rank));
972   PetscCall(VecGetOwnershipRange(x, &low, NULL));
973 
974   if (size == 1) {
975     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
976     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
977     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
978     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
979     PetscCall(VecScatterDestroy(&vecscat));
980     PetscCall(ISDestroy(&list1));
981 
982 #if !PetscDefined(HAVE_MPIUNI)
983   } else {
984     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
985     PetscInt  ndim = fft->ndim, *dim = fft->dim;
986     ptrdiff_t local_n0, local_0_start;
987     ptrdiff_t local_n1, local_1_start;
988     IS        list2;
989 #if !defined(PETSC_USE_COMPLEX)
990     PetscInt  i, j, k, partial_dim;
991     PetscInt *indx1, *indx2, tempindx, tempindx1;
992     PetscInt  NM;
993     ptrdiff_t temp;
994 #endif
995     switch (ndim) {
996     case 1:
997 #if defined(PETSC_USE_COMPLEX)
998       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
999 
1000       PetscCall(ISCreateStride(comm, local_n1, local_1_start, 1, &list1));
1001       PetscCall(ISCreateStride(comm, local_n1, low, 1, &list2));
1002       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1003       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1004       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1005       PetscCall(VecScatterDestroy(&vecscat));
1006       PetscCall(ISDestroy(&list1));
1007       PetscCall(ISDestroy(&list2));
1008 #else
1009       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
1010 #endif
1011       break;
1012     case 2:
1013 #if defined(PETSC_USE_COMPLEX)
1014       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1015 
1016       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
1017       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
1018       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1019       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1020       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1021       PetscCall(VecScatterDestroy(&vecscat));
1022       PetscCall(ISDestroy(&list1));
1023       PetscCall(ISDestroy(&list2));
1024 #else
1025       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1026 
1027       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
1028       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
1029 
1030       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1031       else NM = dim[1] + 1;
1032 
1033       for (i = 0; i < local_n0; i++) {
1034         for (j = 0; j < dim[1]; j++) {
1035           tempindx  = i * dim[1] + j;
1036           tempindx1 = i * NM + j;
1037 
1038           indx1[tempindx] = local_0_start * dim[1] + tempindx;
1039           indx2[tempindx] = low + tempindx1;
1040         }
1041       }
1042 
1043       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
1044       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
1045 
1046       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1047       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1048       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1049       PetscCall(VecScatterDestroy(&vecscat));
1050       PetscCall(ISDestroy(&list1));
1051       PetscCall(ISDestroy(&list2));
1052       PetscCall(PetscFree(indx1));
1053       PetscCall(PetscFree(indx2));
1054 #endif
1055       break;
1056     case 3:
1057 #if defined(PETSC_USE_COMPLEX)
1058       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1059 
1060       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
1061       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
1062       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1063       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1064       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1065       PetscCall(VecScatterDestroy(&vecscat));
1066       PetscCall(ISDestroy(&list1));
1067       PetscCall(ISDestroy(&list2));
1068 #else
1069       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1070 
1071       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
1072       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
1073 
1074       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1075       else NM = dim[2] + 1;
1076 
1077       for (i = 0; i < local_n0; i++) {
1078         for (j = 0; j < dim[1]; j++) {
1079           for (k = 0; k < dim[2]; k++) {
1080             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1081             tempindx1 = i * dim[1] * NM + j * NM + k;
1082 
1083             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
1084             indx2[tempindx] = low + tempindx1;
1085           }
1086         }
1087       }
1088 
1089       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
1090       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
1091 
1092       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1093       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1094       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1095       PetscCall(VecScatterDestroy(&vecscat));
1096       PetscCall(ISDestroy(&list1));
1097       PetscCall(ISDestroy(&list2));
1098       PetscCall(PetscFree(indx1));
1099       PetscCall(PetscFree(indx2));
1100 #endif
1101       break;
1102     default:
1103 #if defined(PETSC_USE_COMPLEX)
1104       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
1105 
1106       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
1107       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
1108       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1109       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1110       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1111       PetscCall(VecScatterDestroy(&vecscat));
1112       PetscCall(ISDestroy(&list1));
1113       PetscCall(ISDestroy(&list2));
1114 #else
1115       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
1116 
1117       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
1118 
1119       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1120 
1121       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
1122 
1123       partial_dim = fftw->partial_dim;
1124 
1125       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
1126       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
1127 
1128       if (dim[ndim - 1] % 2 == 0) NM = 2;
1129       else NM = 1;
1130 
1131       j = low;
1132       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
1133         indx1[i] = local_0_start * partial_dim + i;
1134         indx2[i] = j;
1135         if (k % dim[ndim - 1] == 0) j += NM;
1136         j++;
1137       }
1138       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
1139       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
1140 
1141       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1142       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1143       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1144       PetscCall(VecScatterDestroy(&vecscat));
1145       PetscCall(ISDestroy(&list1));
1146       PetscCall(ISDestroy(&list2));
1147       PetscCall(PetscFree(indx1));
1148       PetscCall(PetscFree(indx2));
1149 #endif
1150       break;
1151     }
1152 #endif
1153   }
1154   PetscFunctionReturn(0);
1155 }
1156 
1157 /*
1158     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1159 
1160   Options Database Keys:
. -mat_fftw_plannerflags - set FFTW planner flags
1162 
1163    Level: intermediate
1164 
1165 */
PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A) {
  MPI_Comm    comm;
  Mat_FFT    *fft = (Mat_FFT *)A->data;
  Mat_FFTW   *fftw;
  PetscInt    ndim = fft->ndim, *dim = fft->dim;
  const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
  unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
  PetscBool   flg;
  PetscInt    p_flag, partial_dim = 1, ctr;
  PetscMPIInt size, rank;
  ptrdiff_t  *pdim;
#if !defined(PETSC_USE_COMPLEX)
  PetscInt tot_dim; /* total (padded) size of the real transform: 2*dim[0]*...*(dim[last]/2+1) */
#endif

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCallMPI(MPI_Comm_rank(comm, &rank));

#if !PetscDefined(HAVE_MPIUNI)
  fftw_mpi_init();
#endif
  /* scratch copy of the dimensions in FFTW's ptrdiff_t type; freed below with free() */
  /* NOTE(review): calloc result is not checked before pdim[0] is written — confirm
     whether a PetscCheck/SETERRQ on allocation failure is wanted here */
  pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
  pdim[0] = dim[0];
#if !defined(PETSC_USE_COMPLEX)
  tot_dim = 2 * dim[0];
#endif
  /* partial_dim = product of all dimensions except the first (slab distribution unit) */
  for (ctr = 1; ctr < ndim; ctr++) {
    partial_dim *= dim[ctr];
    pdim[ctr] = dim[ctr];
#if !defined(PETSC_USE_COMPLEX)
    if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1); /* last dimension is stored half-complex */
    else tot_dim *= dim[ctr];
#endif
  }

  if (size == 1) {
    /* sequential: the matrix is square of size N (complex) or tot_dim (padded real) */
#if defined(PETSC_USE_COMPLEX)
    PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
    fft->n = fft->N;
#else
    PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
    fft->n       = tot_dim;
#endif
#if !PetscDefined(HAVE_MPIUNI)
  } else {
    /* parallel: ask FFTW for the local slab size on this process to set local sizes */
    ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
#if !defined(PETSC_USE_COMPLEX)
    ptrdiff_t temp;
    PetscInt  N1;
#endif

    switch (ndim) {
    case 1:
#if !defined(PETSC_USE_COMPLEX)
      SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
#else
      fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
      fft->n = (PetscInt)local_n0;
      /* 1D transforms distribute input and output differently: rows use the output slab */
      PetscCall(MatSetSizes(A, local_n1, fft->n, fft->N, fft->N));
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
      fft->n = (PetscInt)local_n0 * dim[1];
      PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
#else
      fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      /* real transform: local and global sizes include the 2*(dim[1]/2+1) padding */
      fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
      PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);

      fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
      PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
#else
      fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
      PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);

      fft->n = (PetscInt)local_n0 * partial_dim;
      PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
#else
      /* temporarily replace the last dimension by its half-complex size for the query */
      temp = pdim[ndim - 1];

      pdim[ndim - 1] = temp / 2 + 1;

      fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);

      fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
      N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);

      pdim[ndim - 1] = temp;

      PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
#endif
      break;
    }
#endif
  }
  free(pdim);
  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
  PetscCall(PetscNewLog(A, &fftw)); /* allocates zeroed Mat_FFTW, so unset fields (e.g. iodims) start NULL */
  fft->data = (void *)fftw;

  fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
  fftw->partial_dim = partial_dim;

  PetscCall(PetscMalloc1(ndim, &(fftw->dim_fftw)));
  if (size == 1) {
    /* iodims is only needed by the sequential guru planner; left NULL in parallel */
#if defined(PETSC_USE_64BIT_INDICES)
    fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
#else
    fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
#endif
  }

  for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];

  /* plans are created lazily on first MatMult/MatMultTranspose */
  fftw->p_forward  = NULL;
  fftw->p_backward = NULL;
  fftw->p_flag     = FFTW_ESTIMATE;

  if (size == 1) {
    A->ops->mult          = MatMult_SeqFFTW;
    A->ops->multtranspose = MatMultTranspose_SeqFFTW;
#if !PetscDefined(HAVE_MPIUNI)
  } else {
    A->ops->mult          = MatMult_MPIFFTW;
    A->ops->multtranspose = MatMultTranspose_MPIFFTW;
#endif
  }
  fft->matdestroy = MatDestroy_FFTW;
  A->assembled    = PETSC_TRUE;
  A->preallocated = PETSC_TRUE;

  /* expose the type-specific operations queried by the public wrappers */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));

  /* get runtime options */
  PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
  PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
  if (flg) fftw->p_flag = iplans[p_flag];
  PetscOptionsEnd();
  PetscFunctionReturn(0);
}
1325