xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 21e3ffae2f3b73c0bd738cf6d0a809700fc04bb0)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h> /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #if !PetscDefined(HAVE_MPIUNI)
10   #include <fftw3-mpi.h>
11 #else
12   #include <fftw3.h>
13 #endif
14 EXTERN_C_END
15 
typedef struct {
  ptrdiff_t ndim_fftw, *dim_fftw; /* number of dimensions and their sizes in FFTW's ptrdiff_t form (as required by the FFTW MPI API) */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims; /* guru-interface dimension/stride descriptors, used for transforms with ndim > 3 */
#else
  fftw_iodim *iodims;
#endif
  PetscInt     partial_dim;            /* presumably a partial product of the dimensions -- set outside this chunk, TODO confirm */
  fftw_plan    p_forward, p_backward;  /* cached plans, created lazily on first MatMult()/MatMultTranspose() */
  unsigned     p_flag;                                      /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar *finarray, *foutarray, *binarray, *boutarray; /* keep track of arrays because fftw plan should be
                                                            executed for the arrays with which the plan was created */
} Mat_FFTW;
29 
30 extern PetscErrorCode MatMult_SeqFFTW(Mat, Vec, Vec);
31 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat, Vec, Vec);
32 extern PetscErrorCode MatDestroy_FFTW(Mat);
33 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat, Vec *, Vec *, Vec *);
34 #if !PetscDefined(HAVE_MPIUNI)
35 extern PetscErrorCode MatMult_MPIFFTW(Mat, Vec, Vec);
36 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat, Vec, Vec);
37 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
38 #endif
39 
/*
   MatMult_SeqFFTW - performs the sequential forward DFT y = F(x)

   Input parameters:
     A - the FFT matrix (Mat_FFT wrapping a Mat_FFTW context)
     x - the vector on which the forward DFT will be performed

   Output parameter:
     y - vector that stores the result of the forward DFT

   Note:
   The FFTW plan is created lazily on the first call and cached in the Mat_FFTW
   context together with the array pointers it was planned for. Later calls with
   the same arrays reuse the plan via fftw_execute(); calls with different arrays
   use FFTW's new-array execute functions.
*/
PetscErrorCode MatMult_SeqFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
#if defined(PETSC_USE_COMPLEX)
  #if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;
  #else
  fftw_iodim *iodims;
  #endif
  PetscInt i;
#endif
  PetscInt ndim = fft->ndim, *dim = fft->dim;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      /* ndim > 3: use FFTW's guru interface; fill the iodim descriptors with
         row-major (C order) strides, innermost dimension contiguous */
      iodims = fftw->iodims;
  #if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        iodims[ndim - 1].n  = (ptrdiff_t)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1; /* last dimension is contiguous */
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n; /* stride = product of faster dims */
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #else
      if (ndim) {
        iodims[ndim - 1].n  = (int)dim[ndim - 1];
        iodims[ndim - 1].is = iodims[ndim - 1].os = 1; /* last dimension is contiguous */
        for (i = ndim - 2; i >= 0; --i) {
          iodims[i].n  = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i + 1].is * iodims[i + 1].n; /* stride = product of faster dims */
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim, (fftw_iodim *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_FORWARD, fftw->p_flag);
  #endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim, (int *)dim, (double *)x_array, (fftw_complex *)y_array, fftw->p_flag);
#endif
      break;
    }
    /* remember which arrays the plan was created for; see the new-array execute path below */
    fftw->finarray  = (PetscScalar *)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else {                                                         /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward, (double *)x_array, (fftw_complex *)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
141 
142 /* MatMultTranspose_SeqFFTW performs serial backward DFT
143    Input parameter:
144      A - the matrix
145      x - the vector on which BDFT will be performed
146 
147    Output parameter:
148      y - vector that stores result of BDFT
149 */
150 
151 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A, Vec x, Vec y)
152 {
153   Mat_FFT           *fft  = (Mat_FFT *)A->data;
154   Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
155   const PetscScalar *x_array;
156   PetscScalar       *y_array;
157   PetscInt           ndim = fft->ndim, *dim = fft->dim;
158 #if defined(PETSC_USE_COMPLEX)
159   #if defined(PETSC_USE_64BIT_INDICES)
160   fftw_iodim64 *iodims = fftw->iodims;
161   #else
162   fftw_iodim *iodims = fftw->iodims;
163   #endif
164 #endif
165 
166   PetscFunctionBegin;
167   PetscCall(VecGetArrayRead(x, &x_array));
168   PetscCall(VecGetArray(y, &y_array));
169   if (!fftw->p_backward) { /* create a plan, then execute it */
170     switch (ndim) {
171     case 1:
172 #if defined(PETSC_USE_COMPLEX)
173       fftw->p_backward = fftw_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
174 #else
175       fftw->p_backward = fftw_plan_dft_c2r_1d(dim[0], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
176 #endif
177       break;
178     case 2:
179 #if defined(PETSC_USE_COMPLEX)
180       fftw->p_backward = fftw_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
181 #else
182       fftw->p_backward = fftw_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
183 #endif
184       break;
185     case 3:
186 #if defined(PETSC_USE_COMPLEX)
187       fftw->p_backward = fftw_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
188 #else
189       fftw->p_backward = fftw_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
190 #endif
191       break;
192     default:
193 #if defined(PETSC_USE_COMPLEX)
194   #if defined(PETSC_USE_64BIT_INDICES)
195       fftw->p_backward = fftw_plan_guru64_dft((int)ndim, (fftw_iodim64 *)iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
196   #else
197       fftw->p_backward = fftw_plan_guru_dft((int)ndim, iodims, 0, NULL, (fftw_complex *)x_array, (fftw_complex *)y_array, FFTW_BACKWARD, fftw->p_flag);
198   #endif
199 #else
200       fftw->p_backward = fftw_plan_dft_c2r((int)ndim, (int *)dim, (fftw_complex *)x_array, (double *)y_array, fftw->p_flag);
201 #endif
202       break;
203     }
204     fftw->binarray  = (PetscScalar *)x_array;
205     fftw->boutarray = y_array;
206     fftw_execute(fftw->p_backward);
207   } else {                                                         /* use existing plan */
208     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
209 #if defined(PETSC_USE_COMPLEX)
210       fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
211 #else
212       fftw_execute_dft_c2r(fftw->p_backward, (fftw_complex *)x_array, (double *)y_array);
213 #endif
214     } else {
215       fftw_execute(fftw->p_backward);
216     }
217   }
218   PetscCall(VecRestoreArray(y, &y_array));
219   PetscCall(VecRestoreArrayRead(x, &x_array));
220   PetscFunctionReturn(PETSC_SUCCESS);
221 }
222 
223 #if !PetscDefined(HAVE_MPIUNI)
/* MatMult_MPIFFTW - performs the parallel forward DFT y = F(x)
   Input parameters:
     A - the FFT matrix
     x - the vector on which the forward DFT will be performed

   Output parameter:
     y - vector that stores the result of the forward DFT

   Note:
   The FFTW MPI plan is created lazily on the first call and cached together with
   the array pointers it was planned for.
*/
PetscErrorCode MatMult_MPIFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
      fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      /* NOTE(review): FFTW_ESTIMATE is hard-coded here (and in the other r2c branches)
         instead of fftw->p_flag -- confirm whether this is intentional */
      fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0], dim[1], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0], dim[1], dim[2], (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    default:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_FORWARD, fftw->p_flag);
  #else
      fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw, fftw->dim_fftw, (double *)x_array, (fftw_complex *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    }
    /* remember which arrays the plan was created for; see the new-array execute path below */
    fftw->finarray  = (PetscScalar *)x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
                planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else {                                                         /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
      /* NOTE(review): for real (r2c) plans this should presumably be the r2c execute
         function, and FFTW documents that new-array execution of MPI plans must use
         fftw_mpi_execute_dft() rather than fftw_execute_dft() -- confirm against the
         FFTW MPI manual before relying on this path */
      fftw_execute_dft(fftw->p_forward, (fftw_complex *)x_array, (fftw_complex *)y_array);
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
292 
/*
   MatMultTranspose_MPIFFTW - performs the parallel backward (unnormalized inverse) DFT
   Input parameters:
     A - the FFT matrix
     x - the vector on which the backward DFT will be performed

   Output parameter:
     y - vector that stores the result of the backward DFT

   Note:
   Mirrors MatMult_MPIFFTW(): the plan is created lazily on first use and cached
   together with the arrays it was planned for.
*/
PetscErrorCode MatMultTranspose_MPIFFTW(Mat A, Vec x, Vec y)
{
  Mat_FFT           *fft  = (Mat_FFT *)A->data;
  Mat_FFTW          *fftw = (Mat_FFTW *)fft->data;
  const PetscScalar *x_array;
  PetscScalar       *y_array;
  PetscInt           ndim = fft->ndim, *dim = fft->dim;
  MPI_Comm           comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
  PetscCall(VecGetArrayRead(x, &x_array));
  PetscCall(VecGetArray(y, &y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      SETERRQ(comm, PETSC_ERR_SUP, "not support for real numbers yet");
  #endif
      break;
    case 2:
  #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
      fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0], dim[1], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      /* NOTE(review): FFTW_ESTIMATE is hard-coded here (and in the other c2r branches)
         instead of fftw->p_flag -- confirm whether this is intentional */
      fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0], dim[1], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    case 3:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0], dim[1], dim[2], (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    default:
  #if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (fftw_complex *)y_array, comm, FFTW_BACKWARD, fftw->p_flag);
  #else
      fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw, fftw->dim_fftw, (fftw_complex *)x_array, (double *)y_array, comm, FFTW_ESTIMATE);
  #endif
      break;
    }
    /* remember which arrays the plan was created for; see the new-array execute path below */
    fftw->binarray  = (PetscScalar *)x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else {                                                         /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
      /* NOTE(review): for real (c2r) plans this should presumably be the c2r execute
         function, and FFTW documents that new-array execution of MPI plans must use
         fftw_mpi_execute_dft() rather than fftw_execute_dft() -- confirm against the
         FFTW MPI manual before relying on this path */
      fftw_execute_dft(fftw->p_backward, (fftw_complex *)x_array, (fftw_complex *)y_array);
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  PetscCall(VecRestoreArray(y, &y_array));
  PetscCall(VecRestoreArrayRead(x, &x_array));
  PetscFunctionReturn(PETSC_SUCCESS);
}
360 #endif
361 
362 PetscErrorCode MatDestroy_FFTW(Mat A)
363 {
364   Mat_FFT  *fft  = (Mat_FFT *)A->data;
365   Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
366 
367   PetscFunctionBegin;
368   fftw_destroy_plan(fftw->p_forward);
369   fftw_destroy_plan(fftw->p_backward);
370   if (fftw->iodims) free(fftw->iodims);
371   PetscCall(PetscFree(fftw->dim_fftw));
372   PetscCall(PetscFree(fft->data));
373   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", NULL));
374   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", NULL));
375   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", NULL));
376 #if !PetscDefined(HAVE_MPIUNI)
377   fftw_mpi_cleanup();
378 #endif
379   PetscFunctionReturn(PETSC_SUCCESS);
380 }
381 
382 #if !PetscDefined(HAVE_MPIUNI)
383   #include <../src/vec/vec/impls/mpi/pvecimpl.h> /*I  "petscvec.h"   I*/
/* VecDestroy_MPIFFTW - destroys a vector whose data array was allocated with
   fftw_malloc() by MatCreateVecsFFTW_FFTW(); the array must therefore be released
   with fftw_free() before the standard MPI vector destroy runs (which would not
   free it, since v->array_allocated is NULL for these vectors). */
PetscErrorCode VecDestroy_MPIFFTW(Vec v)
{
  PetscScalar *array;

  PetscFunctionBegin;
  PetscCall(VecGetArray(v, &array));
  fftw_free((fftw_complex *)array);
  PetscCall(VecRestoreArray(v, &array));
  PetscCall(VecDestroy_MPI(v)); /* normal teardown of the remaining Vec state */
  PetscFunctionReturn(PETSC_SUCCESS);
}
395 #endif
396 
397 #if !PetscDefined(HAVE_MPIUNI)
398 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin, Vec *fin_new)
399 {
400   Mat A;
401 
402   PetscFunctionBegin;
403   PetscCall(PetscObjectQuery((PetscObject)fin, "FFTmatrix", (PetscObject *)&A));
404   PetscCall(MatCreateVecsFFTW_FFTW(A, fin_new, NULL, NULL));
405   PetscFunctionReturn(PETSC_SUCCESS);
406 }
407 
408 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout, Vec *fout_new)
409 {
410   Mat A;
411 
412   PetscFunctionBegin;
413   PetscCall(PetscObjectQuery((PetscObject)fout, "FFTmatrix", (PetscObject *)&A));
414   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, fout_new, NULL));
415   PetscFunctionReturn(PETSC_SUCCESS);
416 }
417 
418 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
419 {
420   Mat A;
421 
422   PetscFunctionBegin;
423   PetscCall(PetscObjectQuery((PetscObject)bout, "FFTmatrix", (PetscObject *)&A));
424   PetscCall(MatCreateVecsFFTW_FFTW(A, NULL, NULL, bout_new));
425   PetscFunctionReturn(PETSC_SUCCESS);
426 }
427 #endif
428 
/*@
   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
     parallel layout determined by `MATFFTW`

   Collective

   Input Parameter:
.   A - the matrix

   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW

  Level: advanced

  Notes:
  The parallel layout of the output of the forward FFTW is always the same as the input
  of the backward FFTW. But the parallel layout of the input vector of the forward
  FFTW might not be the same as the output of the backward FFTW.

  We need to provide enough space while doing a parallel real transform.
  We need to pad extra zeros at the end of the last dimension. For this reason one needs to
  invoke the fftw_mpi_local_size_transposed routines. Remember one has to change the
  last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
  depends on whether the last dimension is even or odd: if it is even two zeros are needed,
  if it is odd only one.

  Lastly, one needs some scratch space at the end of the data set in each process. alloc_local
  figures out how much space is needed, i.e. it figures out the data+scratch space for
  each processor and returns that.

  Developer Note:
  How come `MatCreateVecs()` doesn't produce the correctly padded vectors automatically?

.seealso: `MATFFTW`, `MatCreateFFT()`, `MatCreateVecs()`
@*/
PetscErrorCode MatCreateVecsFFTW(Mat A, Vec *x, Vec *y, Vec *z)
{
  PetscFunctionBegin;
  /* dispatch to the implementation composed on A (MatCreateVecsFFTW_FFTW) */
  PetscUseMethod(A, "MatCreateVecsFFTW_C", (Mat, Vec *, Vec *, Vec *), (A, x, y, z));
  PetscFunctionReturn(PETSC_SUCCESS);
}
472 
473 PetscErrorCode MatCreateVecsFFTW_FFTW(Mat A, Vec *fin, Vec *fout, Vec *bout)
474 {
475   PetscMPIInt size, rank;
476   MPI_Comm    comm;
477   Mat_FFT    *fft = (Mat_FFT *)A->data;
478 
479   PetscFunctionBegin;
480   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
481   PetscValidType(A, 1);
482   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
483 
484   PetscCallMPI(MPI_Comm_size(comm, &size));
485   PetscCallMPI(MPI_Comm_rank(comm, &rank));
486   if (size == 1) { /* sequential case */
487 #if defined(PETSC_USE_COMPLEX)
488     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fin));
489     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, fout));
490     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->N, bout));
491 #else
492     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fin));
493     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, fout));
494     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF, fft->n, bout));
495 #endif
496 #if !PetscDefined(HAVE_MPIUNI)
497   } else { /* parallel cases */
498     Mat_FFTW     *fftw = (Mat_FFTW *)fft->data;
499     PetscInt      ndim = fft->ndim, *dim = fft->dim;
500     ptrdiff_t     alloc_local, local_n0, local_0_start;
501     ptrdiff_t     local_n1;
502     fftw_complex *data_fout;
503     ptrdiff_t     local_1_start;
504   #if defined(PETSC_USE_COMPLEX)
505     fftw_complex *data_fin, *data_bout;
506   #else
507     double   *data_finr, *data_boutr;
508     PetscInt  n1, N1;
509     ptrdiff_t temp;
510   #endif
511 
512     switch (ndim) {
513     case 1:
514   #if !defined(PETSC_USE_COMPLEX)
515       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not allow parallel real 1D transform");
516   #else
517       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
518       if (fin) {
519         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
520         PetscCall(VecCreateMPIWithArray(comm, 1, local_n0, fft->N, (const PetscScalar *)data_fin, fin));
521         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
522         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
523         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
524       }
525       if (fout) {
526         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
527         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_fout, fout));
528         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
529         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
530         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
531       }
532       alloc_local = fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
533       if (bout) {
534         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
535         PetscCall(VecCreateMPIWithArray(comm, 1, local_n1, fft->N, (const PetscScalar *)data_bout, bout));
536         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
537         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
538         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
539       }
540       break;
541   #endif
542     case 2:
543   #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
544       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
545       N1          = 2 * dim[0] * (dim[1] / 2 + 1);
546       n1          = 2 * local_n0 * (dim[1] / 2 + 1);
547       if (fin) {
548         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
549         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
550         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
551         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
552         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
553       }
554       if (fout) {
555         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
556         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_fout, fout));
557         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
558         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
559         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
560       }
561       if (bout) {
562         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
563         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
564         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
565         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
566         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
567       }
568   #else
569       /* Get local size */
570       alloc_local = fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
571       if (fin) {
572         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
573         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
574         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
575         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
576         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
577       }
578       if (fout) {
579         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
580         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
581         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
582         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
583         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
584       }
585       if (bout) {
586         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
587         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
588         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
589         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
590         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
591       }
592   #endif
593       break;
594     case 3:
595   #if !defined(PETSC_USE_COMPLEX)
596       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
597       N1          = 2 * dim[0] * dim[1] * (dim[2] / 2 + 1);
598       n1          = 2 * local_n0 * dim[1] * (dim[2] / 2 + 1);
599       if (fin) {
600         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
601         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_finr, fin));
602         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
603         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
604         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
605       }
606       if (fout) {
607         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
608         PetscCall(VecCreateMPIWithArray(comm, 1, n1, N1, (PetscScalar *)data_fout, fout));
609         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
610         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
611         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
612       }
613       if (bout) {
614         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
615         PetscCall(VecCreateMPIWithArray(comm, 1, (PetscInt)n1, N1, (PetscScalar *)data_boutr, bout));
616         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
617         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
618         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
619       }
620   #else
621       alloc_local = fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
622       if (fin) {
623         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
624         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
625         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
626         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
627         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
628       }
629       if (fout) {
630         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
631         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
632         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
633         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
634         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
635       }
636       if (bout) {
637         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
638         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
639         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
640         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
641         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
642       }
643   #endif
644       break;
645     default:
646   #if !defined(PETSC_USE_COMPLEX)
647       temp                                  = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
648       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
649       alloc_local                           = fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
650       N1                                    = 2 * fft->N * (PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw - 1]) / ((PetscInt)temp);
651       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
652 
653       if (fin) {
654         data_finr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
655         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_finr, fin));
656         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
657         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
658         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
659       }
660       if (fout) {
661         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
662         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_fout, fout));
663         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
664         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
665         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
666       }
667       if (bout) {
668         data_boutr = (double *)fftw_malloc(sizeof(double) * alloc_local * 2);
669         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, N1, (PetscScalar *)data_boutr, bout));
670         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
671         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
672         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
673       }
674   #else
675       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
676       if (fin) {
677         data_fin = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
678         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fin, fin));
679         PetscCall(PetscObjectCompose((PetscObject)*fin, "FFTmatrix", (PetscObject)A));
680         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
681         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
682       }
683       if (fout) {
684         data_fout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
685         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_fout, fout));
686         PetscCall(PetscObjectCompose((PetscObject)*fout, "FFTmatrix", (PetscObject)A));
687         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
688         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
689       }
690       if (bout) {
691         data_bout = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * alloc_local);
692         PetscCall(VecCreateMPIWithArray(comm, 1, fft->n, fft->N, (const PetscScalar *)data_bout, bout));
693         PetscCall(PetscObjectCompose((PetscObject)*bout, "FFTmatrix", (PetscObject)A));
694         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
695         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
696       }
697   #endif
698       break;
699     }
700     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
701        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
702        memory leaks. We void these pointers here to avoid accident uses.
703     */
704     if (fin) (*fin)->ops->replacearray = NULL;
705     if (fout) (*fout)->ops->replacearray = NULL;
706     if (bout) (*bout)->ops->replacearray = NULL;
707 #endif
708   }
709   PetscFunctionReturn(PETSC_SUCCESS);
710 }
711 
712 /*@
713    VecScatterPetscToFFTW - Copies a PETSc vector to the vector that goes into `MATFFTW` calls.
714 
715    Collective
716 
717    Input Parameters:
718 +  A - FFTW matrix
719 -  x - the PETSc vector
720 
   Output Parameter:
.  y - the FFTW vector
723 
724    Level: intermediate
725 
   Note:
   For a real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
   one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
   zeros. This routine does that job by a scattering operation.
730 
731 .seealso: `MATFFTW`, `VecScatterFFTWToPetsc()`, `MatCreateVecsFFTW()`
732 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under "VecScatterPetscToFFTW_C"
     (VecScatterPetscToFFTW_FFTW() for MATFFTW); errors if no such method is composed. */
  PetscUseMethod(A, "VecScatterPetscToFFTW_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
739 
740 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A, Vec x, Vec y)
741 {
742   MPI_Comm    comm;
743   Mat_FFT    *fft = (Mat_FFT *)A->data;
744   PetscInt    low;
745   PetscMPIInt rank, size;
746   PetscInt    vsize, vsize1;
747   VecScatter  vecscat;
748   IS          list1;
749 
750   PetscFunctionBegin;
751   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
752   PetscCallMPI(MPI_Comm_size(comm, &size));
753   PetscCallMPI(MPI_Comm_rank(comm, &rank));
754   PetscCall(VecGetOwnershipRange(y, &low, NULL));
755 
756   if (size == 1) {
757     PetscCall(VecGetSize(x, &vsize));
758     PetscCall(VecGetSize(y, &vsize1));
759     PetscCall(ISCreateStride(PETSC_COMM_SELF, fft->N, 0, 1, &list1));
760     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
761     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
762     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
763     PetscCall(VecScatterDestroy(&vecscat));
764     PetscCall(ISDestroy(&list1));
765 #if !PetscDefined(HAVE_MPIUNI)
766   } else {
767     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
768     PetscInt  ndim = fft->ndim, *dim = fft->dim;
769     ptrdiff_t local_n0, local_0_start;
770     ptrdiff_t local_n1, local_1_start;
771     IS        list2;
772   #if !defined(PETSC_USE_COMPLEX)
773     PetscInt  i, j, k, partial_dim;
774     PetscInt *indx1, *indx2, tempindx, tempindx1;
775     PetscInt  NM;
776     ptrdiff_t temp;
777   #endif
778 
779     switch (ndim) {
780     case 1:
781   #if defined(PETSC_USE_COMPLEX)
782       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
783 
784       PetscCall(ISCreateStride(comm, local_n0, local_0_start, 1, &list1));
785       PetscCall(ISCreateStride(comm, local_n0, low, 1, &list2));
786       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
787       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
788       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
789       PetscCall(VecScatterDestroy(&vecscat));
790       PetscCall(ISDestroy(&list1));
791       PetscCall(ISDestroy(&list2));
792   #else
793       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
794   #endif
795       break;
796     case 2:
797   #if defined(PETSC_USE_COMPLEX)
798       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
799 
800       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
801       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
802       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
803       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
804       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
805       PetscCall(VecScatterDestroy(&vecscat));
806       PetscCall(ISDestroy(&list1));
807       PetscCall(ISDestroy(&list2));
808   #else
809       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
810 
811       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
812       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
813 
814       if (dim[1] % 2 == 0) {
815         NM = dim[1] + 2;
816       } else {
817         NM = dim[1] + 1;
818       }
819       for (i = 0; i < local_n0; i++) {
820         for (j = 0; j < dim[1]; j++) {
821           tempindx  = i * dim[1] + j;
822           tempindx1 = i * NM + j;
823 
824           indx1[tempindx] = local_0_start * dim[1] + tempindx;
825           indx2[tempindx] = low + tempindx1;
826         }
827       }
828 
829       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
830       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
831 
832       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
833       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
834       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
835       PetscCall(VecScatterDestroy(&vecscat));
836       PetscCall(ISDestroy(&list1));
837       PetscCall(ISDestroy(&list2));
838       PetscCall(PetscFree(indx1));
839       PetscCall(PetscFree(indx2));
840   #endif
841       break;
842 
843     case 3:
844   #if defined(PETSC_USE_COMPLEX)
845       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
846 
847       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
848       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
849       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
850       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
851       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
852       PetscCall(VecScatterDestroy(&vecscat));
853       PetscCall(ISDestroy(&list1));
854       PetscCall(ISDestroy(&list2));
855   #else
856       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
857       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 3D real transform");
858       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
859 
860       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
861       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
862 
863       if (dim[2] % 2 == 0) NM = dim[2] + 2;
864       else NM = dim[2] + 1;
865 
866       for (i = 0; i < local_n0; i++) {
867         for (j = 0; j < dim[1]; j++) {
868           for (k = 0; k < dim[2]; k++) {
869             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
870             tempindx1 = i * dim[1] * NM + j * NM + k;
871 
872             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
873             indx2[tempindx] = low + tempindx1;
874           }
875         }
876       }
877 
878       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
879       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
880       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
881       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
882       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
883       PetscCall(VecScatterDestroy(&vecscat));
884       PetscCall(ISDestroy(&list1));
885       PetscCall(ISDestroy(&list2));
886       PetscCall(PetscFree(indx1));
887       PetscCall(PetscFree(indx2));
888   #endif
889       break;
890 
891     default:
892   #if defined(PETSC_USE_COMPLEX)
893       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
894 
895       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
896       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
897       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
898       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
899       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
900       PetscCall(VecScatterDestroy(&vecscat));
901       PetscCall(ISDestroy(&list1));
902       PetscCall(ISDestroy(&list2));
903   #else
904       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
905       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel DIM>3 real transform");
906       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
907 
908       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
909 
910       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
911 
912       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
913 
914       partial_dim = fftw->partial_dim;
915 
916       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
917       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
918 
919       if (dim[ndim - 1] % 2 == 0) NM = 2;
920       else NM = 1;
921 
922       j = low;
923       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
924         indx1[i] = local_0_start * partial_dim + i;
925         indx2[i] = j;
926         if (k % dim[ndim - 1] == 0) j += NM;
927         j++;
928       }
929       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
930       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
931       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
932       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
933       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
934       PetscCall(VecScatterDestroy(&vecscat));
935       PetscCall(ISDestroy(&list1));
936       PetscCall(ISDestroy(&list2));
937       PetscCall(PetscFree(indx1));
938       PetscCall(PetscFree(indx2));
939   #endif
940       break;
941     }
942 #endif
943   }
944   PetscFunctionReturn(PETSC_SUCCESS);
945 }
946 
947 /*@
948    VecScatterFFTWToPetsc - Converts `MATFFTW` output vector to a PETSc vector.
949 
950    Collective
951 
   Input Parameters:
+  A - `MATFFTW` matrix
-  x - FFTW vector

   Output Parameter:
.  y - PETSc vector
958 
959    Level: intermediate
960 
   Note:
   While doing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
   `VecScatterFFTWToPetsc()` removes those extra zeros.
964 
965 .seealso: `VecScatterPetscToFFTW()`, `MATFFTW`, `MatCreateVecsFFTW()`
966 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A, Vec x, Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under "VecScatterFFTWToPetsc_C"
     (VecScatterFFTWToPetsc_FFTW() for MATFFTW); errors if no such method is composed. */
  PetscUseMethod(A, "VecScatterFFTWToPetsc_C", (Mat, Vec, Vec), (A, x, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
973 
974 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A, Vec x, Vec y)
975 {
976   MPI_Comm    comm;
977   Mat_FFT    *fft = (Mat_FFT *)A->data;
978   PetscInt    low;
979   PetscMPIInt rank, size;
980   VecScatter  vecscat;
981   IS          list1;
982 
983   PetscFunctionBegin;
984   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
985   PetscCallMPI(MPI_Comm_size(comm, &size));
986   PetscCallMPI(MPI_Comm_rank(comm, &rank));
987   PetscCall(VecGetOwnershipRange(x, &low, NULL));
988 
989   if (size == 1) {
990     PetscCall(ISCreateStride(comm, fft->N, 0, 1, &list1));
991     PetscCall(VecScatterCreate(x, list1, y, list1, &vecscat));
992     PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
993     PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
994     PetscCall(VecScatterDestroy(&vecscat));
995     PetscCall(ISDestroy(&list1));
996 
997 #if !PetscDefined(HAVE_MPIUNI)
998   } else {
999     Mat_FFTW *fftw = (Mat_FFTW *)fft->data;
1000     PetscInt  ndim = fft->ndim, *dim = fft->dim;
1001     ptrdiff_t local_n0, local_0_start;
1002     ptrdiff_t local_n1, local_1_start;
1003     IS        list2;
1004   #if !defined(PETSC_USE_COMPLEX)
1005     PetscInt  i, j, k, partial_dim;
1006     PetscInt *indx1, *indx2, tempindx, tempindx1;
1007     PetscInt  NM;
1008     ptrdiff_t temp;
1009   #endif
1010     switch (ndim) {
1011     case 1:
1012   #if defined(PETSC_USE_COMPLEX)
1013       fftw_mpi_local_size_1d(dim[0], comm, FFTW_BACKWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1014 
1015       PetscCall(ISCreateStride(comm, local_n1, local_1_start, 1, &list1));
1016       PetscCall(ISCreateStride(comm, local_n1, low, 1, &list2));
1017       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1018       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1019       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1020       PetscCall(VecScatterDestroy(&vecscat));
1021       PetscCall(ISDestroy(&list1));
1022       PetscCall(ISDestroy(&list2));
1023   #else
1024       SETERRQ(comm, PETSC_ERR_SUP, "No support for real parallel 1D FFT");
1025   #endif
1026       break;
1027     case 2:
1028   #if defined(PETSC_USE_COMPLEX)
1029       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1030 
1031       PetscCall(ISCreateStride(comm, local_n0 * dim[1], local_0_start * dim[1], 1, &list1));
1032       PetscCall(ISCreateStride(comm, local_n0 * dim[1], low, 1, &list2));
1033       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1034       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1035       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1036       PetscCall(VecScatterDestroy(&vecscat));
1037       PetscCall(ISDestroy(&list1));
1038       PetscCall(ISDestroy(&list2));
1039   #else
1040       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1041 
1042       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx1));
1043       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1], &indx2));
1044 
1045       if (dim[1] % 2 == 0) NM = dim[1] + 2;
1046       else NM = dim[1] + 1;
1047 
1048       for (i = 0; i < local_n0; i++) {
1049         for (j = 0; j < dim[1]; j++) {
1050           tempindx  = i * dim[1] + j;
1051           tempindx1 = i * NM + j;
1052 
1053           indx1[tempindx] = local_0_start * dim[1] + tempindx;
1054           indx2[tempindx] = low + tempindx1;
1055         }
1056       }
1057 
1058       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx1, PETSC_COPY_VALUES, &list1));
1059       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1], indx2, PETSC_COPY_VALUES, &list2));
1060 
1061       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1062       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1063       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1064       PetscCall(VecScatterDestroy(&vecscat));
1065       PetscCall(ISDestroy(&list1));
1066       PetscCall(ISDestroy(&list2));
1067       PetscCall(PetscFree(indx1));
1068       PetscCall(PetscFree(indx2));
1069   #endif
1070       break;
1071     case 3:
1072   #if defined(PETSC_USE_COMPLEX)
1073       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1074 
1075       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], local_0_start * dim[1] * dim[2], 1, &list1));
1076       PetscCall(ISCreateStride(comm, local_n0 * dim[1] * dim[2], low, 1, &list2));
1077       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1078       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1079       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1080       PetscCall(VecScatterDestroy(&vecscat));
1081       PetscCall(ISDestroy(&list1));
1082       PetscCall(ISDestroy(&list2));
1083   #else
1084       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1085 
1086       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx1));
1087       PetscCall(PetscMalloc1(((PetscInt)local_n0) * dim[1] * dim[2], &indx2));
1088 
1089       if (dim[2] % 2 == 0) NM = dim[2] + 2;
1090       else NM = dim[2] + 1;
1091 
1092       for (i = 0; i < local_n0; i++) {
1093         for (j = 0; j < dim[1]; j++) {
1094           for (k = 0; k < dim[2]; k++) {
1095             tempindx  = i * dim[1] * dim[2] + j * dim[2] + k;
1096             tempindx1 = i * dim[1] * NM + j * NM + k;
1097 
1098             indx1[tempindx] = local_0_start * dim[1] * dim[2] + tempindx;
1099             indx2[tempindx] = low + tempindx1;
1100           }
1101         }
1102       }
1103 
1104       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx1, PETSC_COPY_VALUES, &list1));
1105       PetscCall(ISCreateGeneral(comm, local_n0 * dim[1] * dim[2], indx2, PETSC_COPY_VALUES, &list2));
1106 
1107       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1108       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1109       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1110       PetscCall(VecScatterDestroy(&vecscat));
1111       PetscCall(ISDestroy(&list1));
1112       PetscCall(ISDestroy(&list2));
1113       PetscCall(PetscFree(indx1));
1114       PetscCall(PetscFree(indx2));
1115   #endif
1116       break;
1117     default:
1118   #if defined(PETSC_USE_COMPLEX)
1119       fftw_mpi_local_size(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start);
1120 
1121       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), local_0_start * (fftw->partial_dim), 1, &list1));
1122       PetscCall(ISCreateStride(comm, local_n0 * (fftw->partial_dim), low, 1, &list2));
1123       PetscCall(VecScatterCreate(x, list1, y, list2, &vecscat));
1124       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1125       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1126       PetscCall(VecScatterDestroy(&vecscat));
1127       PetscCall(ISDestroy(&list1));
1128       PetscCall(ISDestroy(&list2));
1129   #else
1130       temp = (fftw->dim_fftw)[fftw->ndim_fftw - 1];
1131 
1132       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp / 2 + 1;
1133 
1134       fftw_mpi_local_size_transposed(fftw->ndim_fftw, fftw->dim_fftw, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1135 
1136       (fftw->dim_fftw)[fftw->ndim_fftw - 1] = temp;
1137 
1138       partial_dim = fftw->partial_dim;
1139 
1140       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx1));
1141       PetscCall(PetscMalloc1(((PetscInt)local_n0) * partial_dim, &indx2));
1142 
1143       if (dim[ndim - 1] % 2 == 0) NM = 2;
1144       else NM = 1;
1145 
1146       j = low;
1147       for (i = 0, k = 1; i < ((PetscInt)local_n0) * partial_dim; i++, k++) {
1148         indx1[i] = local_0_start * partial_dim + i;
1149         indx2[i] = j;
1150         if (k % dim[ndim - 1] == 0) j += NM;
1151         j++;
1152       }
1153       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx1, PETSC_COPY_VALUES, &list1));
1154       PetscCall(ISCreateGeneral(comm, local_n0 * partial_dim, indx2, PETSC_COPY_VALUES, &list2));
1155 
1156       PetscCall(VecScatterCreate(x, list2, y, list1, &vecscat));
1157       PetscCall(VecScatterBegin(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1158       PetscCall(VecScatterEnd(vecscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
1159       PetscCall(VecScatterDestroy(&vecscat));
1160       PetscCall(ISDestroy(&list1));
1161       PetscCall(ISDestroy(&list2));
1162       PetscCall(PetscFree(indx1));
1163       PetscCall(PetscFree(indx2));
1164   #endif
1165       break;
1166     }
1167 #endif
1168   }
1169   PetscFunctionReturn(PETSC_SUCCESS);
1170 }
1171 
1172 /*
1173     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1174 
  Options Database Key:
. -mat_fftw_plannerflags - set FFTW planner flags
1177 
1178    Level: intermediate
1179 
1180 */
1181 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1182 {
1183   MPI_Comm    comm;
1184   Mat_FFT    *fft = (Mat_FFT *)A->data;
1185   Mat_FFTW   *fftw;
1186   PetscInt    ndim = fft->ndim, *dim = fft->dim;
1187   const char *plans[]  = {"FFTW_ESTIMATE", "FFTW_MEASURE", "FFTW_PATIENT", "FFTW_EXHAUSTIVE"};
1188   unsigned    iplans[] = {FFTW_ESTIMATE, FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE};
1189   PetscBool   flg;
1190   PetscInt    p_flag, partial_dim = 1, ctr;
1191   PetscMPIInt size, rank;
1192   ptrdiff_t  *pdim;
1193 #if !defined(PETSC_USE_COMPLEX)
1194   PetscInt tot_dim;
1195 #endif
1196 
1197   PetscFunctionBegin;
1198   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
1199   PetscCallMPI(MPI_Comm_size(comm, &size));
1200   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1201 
1202 #if !PetscDefined(HAVE_MPIUNI)
1203   fftw_mpi_init();
1204 #endif
1205   pdim    = (ptrdiff_t *)calloc(ndim, sizeof(ptrdiff_t));
1206   pdim[0] = dim[0];
1207 #if !defined(PETSC_USE_COMPLEX)
1208   tot_dim = 2 * dim[0];
1209 #endif
1210   for (ctr = 1; ctr < ndim; ctr++) {
1211     partial_dim *= dim[ctr];
1212     pdim[ctr] = dim[ctr];
1213 #if !defined(PETSC_USE_COMPLEX)
1214     if (ctr == ndim - 1) tot_dim *= (dim[ctr] / 2 + 1);
1215     else tot_dim *= dim[ctr];
1216 #endif
1217   }
1218 
1219   if (size == 1) {
1220 #if defined(PETSC_USE_COMPLEX)
1221     PetscCall(MatSetSizes(A, fft->N, fft->N, fft->N, fft->N));
1222     fft->n = fft->N;
1223 #else
1224     PetscCall(MatSetSizes(A, tot_dim, tot_dim, tot_dim, tot_dim));
1225     fft->n       = tot_dim;
1226 #endif
1227 #if !PetscDefined(HAVE_MPIUNI)
1228   } else {
1229     ptrdiff_t local_n0, local_0_start, local_n1, local_1_start;
1230   #if !defined(PETSC_USE_COMPLEX)
1231     ptrdiff_t temp;
1232     PetscInt  N1;
1233   #endif
1234 
1235     switch (ndim) {
1236     case 1:
1237   #if !defined(PETSC_USE_COMPLEX)
1238       SETERRQ(comm, PETSC_ERR_SUP, "FFTW does not support parallel 1D real transform");
1239   #else
1240       fftw_mpi_local_size_1d(dim[0], comm, FFTW_FORWARD, FFTW_ESTIMATE, &local_n0, &local_0_start, &local_n1, &local_1_start);
1241       fft->n = (PetscInt)local_n0;
1242       PetscCall(MatSetSizes(A, local_n1, fft->n, fft->N, fft->N));
1243   #endif
1244       break;
1245     case 2:
1246   #if defined(PETSC_USE_COMPLEX)
1247       fftw_mpi_local_size_2d(dim[0], dim[1], comm, &local_n0, &local_0_start);
1248       fft->n = (PetscInt)local_n0 * dim[1];
1249       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1250   #else
1251       fftw_mpi_local_size_2d_transposed(dim[0], dim[1] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1252 
1253       fft->n = 2 * (PetscInt)local_n0 * (dim[1] / 2 + 1);
1254       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * (dim[1] / 2 + 1), 2 * dim[0] * (dim[1] / 2 + 1)));
1255   #endif
1256       break;
1257     case 3:
1258   #if defined(PETSC_USE_COMPLEX)
1259       fftw_mpi_local_size_3d(dim[0], dim[1], dim[2], comm, &local_n0, &local_0_start);
1260 
1261       fft->n = (PetscInt)local_n0 * dim[1] * dim[2];
1262       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1263   #else
1264       fftw_mpi_local_size_3d_transposed(dim[0], dim[1], dim[2] / 2 + 1, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1265 
1266       fft->n = 2 * (PetscInt)local_n0 * dim[1] * (dim[2] / 2 + 1);
1267       PetscCall(MatSetSizes(A, fft->n, fft->n, 2 * dim[0] * dim[1] * (dim[2] / 2 + 1), 2 * dim[0] * dim[1] * (dim[2] / 2 + 1)));
1268   #endif
1269       break;
1270     default:
1271   #if defined(PETSC_USE_COMPLEX)
1272       fftw_mpi_local_size(ndim, pdim, comm, &local_n0, &local_0_start);
1273 
1274       fft->n = (PetscInt)local_n0 * partial_dim;
1275       PetscCall(MatSetSizes(A, fft->n, fft->n, fft->N, fft->N));
1276   #else
1277       temp = pdim[ndim - 1];
1278 
1279       pdim[ndim - 1] = temp / 2 + 1;
1280 
1281       fftw_mpi_local_size_transposed(ndim, pdim, comm, &local_n0, &local_0_start, &local_n1, &local_1_start);
1282 
1283       fft->n = 2 * (PetscInt)local_n0 * partial_dim * pdim[ndim - 1] / temp;
1284       N1     = 2 * fft->N * (PetscInt)pdim[ndim - 1] / ((PetscInt)temp);
1285 
1286       pdim[ndim - 1] = temp;
1287 
1288       PetscCall(MatSetSizes(A, fft->n, fft->n, N1, N1));
1289   #endif
1290       break;
1291     }
1292 #endif
1293   }
1294   free(pdim);
1295   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATFFTW));
1296   PetscCall(PetscNew(&fftw));
1297   fft->data = (void *)fftw;
1298 
1299   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1300   fftw->partial_dim = partial_dim;
1301 
1302   PetscCall(PetscMalloc1(ndim, &(fftw->dim_fftw)));
1303   if (size == 1) {
1304 #if defined(PETSC_USE_64BIT_INDICES)
1305     fftw->iodims = (fftw_iodim64 *)malloc(sizeof(fftw_iodim64) * ndim);
1306 #else
1307     fftw->iodims = (fftw_iodim *)malloc(sizeof(fftw_iodim) * ndim);
1308 #endif
1309   }
1310 
1311   for (ctr = 0; ctr < ndim; ctr++) (fftw->dim_fftw)[ctr] = dim[ctr];
1312 
1313   fftw->p_forward  = NULL;
1314   fftw->p_backward = NULL;
1315   fftw->p_flag     = FFTW_ESTIMATE;
1316 
1317   if (size == 1) {
1318     A->ops->mult          = MatMult_SeqFFTW;
1319     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1320 #if !PetscDefined(HAVE_MPIUNI)
1321   } else {
1322     A->ops->mult          = MatMult_MPIFFTW;
1323     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1324 #endif
1325   }
1326   fft->matdestroy = MatDestroy_FFTW;
1327   A->assembled    = PETSC_TRUE;
1328   A->preallocated = PETSC_TRUE;
1329 
1330   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCreateVecsFFTW_C", MatCreateVecsFFTW_FFTW));
1331   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterPetscToFFTW_C", VecScatterPetscToFFTW_FFTW));
1332   PetscCall(PetscObjectComposeFunction((PetscObject)A, "VecScatterFFTWToPetsc_C", VecScatterFFTWToPetsc_FFTW));
1333 
1334   /* get runtime options */
1335   PetscOptionsBegin(PetscObjectComm((PetscObject)A), ((PetscObject)A)->prefix, "FFTW Options", "Mat");
1336   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags", "Planner Flags", "None", plans, 4, plans[0], &p_flag, &flg));
1337   if (flg) fftw->p_flag = iplans[p_flag];
1338   PetscOptionsEnd();
1339   PetscFunctionReturn(PETSC_SUCCESS);
1340 }
1341