xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision ebead697dbf761eb322f829370bbe90b3bd93fa3)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #if !PetscDefined(HAVE_MPIUNI)
10 #include <fftw3-mpi.h>
11 #else
12 #include <fftw3.h>
13 #endif
14 EXTERN_C_END
15 
/* Private data for MATFFTW matrices; hangs off Mat_FFT->data */
typedef struct {
  ptrdiff_t    ndim_fftw,*dim_fftw;  /* transform rank and dimensions in FFTW's preferred ptrdiff_t type */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;              /* guru-interface dimension descriptors, used for >3-D transforms */
#else
  fftw_iodim   *iodims;
#endif
  PetscInt     partial_dim;          /* presumably the product of the non-leading dimensions -- confirm in MatCreate_FFTW */
  fftw_plan    p_forward,p_backward; /* plans are created lazily on first MatMult/MatMultTranspose */
  unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because an fftw plan should be
                                                             executed for the arrays with which the plan was created */
} Mat_FFTW;
29 
30 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
31 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
32 extern PetscErrorCode MatDestroy_FFTW(Mat);
33 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat,Vec*,Vec*,Vec*);
34 #if !PetscDefined(HAVE_MPIUNI)
35 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
36 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
37 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
38 #endif
39 
/*
   MatMult_SeqFFTW performs the sequential forward DFT
   Input parameters:
     A - the matrix
     x - the vector on which the forward DFT will be performed

   Output parameter:
     y - vector that stores the result of the forward DFT

   Notes: the plan is created lazily on the first call and cached in
   fftw->p_forward; subsequent calls with different arrays reuse the plan
   via FFTW's new-array execute functions.
*/
PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims;
#else
  fftw_iodim     *iodims;
#endif
  PetscInt       i;
#endif
  PetscInt       ndim = fft->ndim,*dim = fft->dim;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x,&x_array));
  PetscCall(VecGetArray(y,&y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      /* real scalars use the real-to-complex (r2c) transform */
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    default:
      /* rank > 3: use FFTW's guru interface with row-major contiguous strides */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        /* last dimension is contiguous; each preceding stride is the product of the trailing sizes */
        iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      if (ndim) {
        iodims[ndim-1].n = (int)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    }
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten
                during planning, so planning should be done before x is initialized!
                See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  PetscCall(VecRestoreArray(y,&y_array));
  PetscCall(VecRestoreArrayRead(x,&x_array));
  PetscFunctionReturn(0);
}
141 
/* MatMultTranspose_SeqFFTW performs the sequential backward DFT
   Input parameters:
     A - the matrix
     x - the vector on which the backward DFT will be performed

   Output parameter:
     y - vector that stores the result of the backward DFT

   Notes: per the FFTW manual the backward transform is unnormalized, so
   applying forward then backward scales the data by the transform size.
   The plan is created lazily on the first call and cached in fftw->p_backward.
*/

PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
  PetscInt       ndim = fft->ndim,*dim = fft->dim;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims = fftw->iodims;  /* set up by MatMult_SeqFFTW for rank > 3 */
#else
  fftw_iodim     *iodims = fftw->iodims;
#endif
#endif

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(x,&x_array));
  PetscCall(VecGetArray(y,&y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      /* real scalars use the complex-to-real (c2r) transform */
      fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    default:
      /* rank > 3: guru interface, reusing the iodims computed by the forward transform */
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#endif
#else
      fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    }
    fftw->binarray  = (PetscScalar *) x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else { /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  PetscCall(VecRestoreArray(y,&y_array));
  PetscCall(VecRestoreArrayRead(x,&x_array));
  PetscFunctionReturn(0);
}
222 
223 #if !PetscDefined(HAVE_MPIUNI)
224 /* MatMult_MPIFFTW performs forward DFT in parallel
225    Input parameter:
226      A - the matrix
227      x - the vector on which FDFT will be performed
228 
229    Output parameter:
230    y   - vector that stores result of FDFT
231 */
232 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
233 {
234   Mat_FFT        *fft  = (Mat_FFT*)A->data;
235   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
236   const PetscScalar *x_array;
237   PetscScalar    *y_array;
238   PetscInt       ndim = fft->ndim,*dim = fft->dim;
239   MPI_Comm       comm;
240 
241   PetscFunctionBegin;
242   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
243   PetscCall(VecGetArrayRead(x,&x_array));
244   PetscCall(VecGetArray(y,&y_array));
245   if (!fftw->p_forward) { /* create a plan, then execute it */
246     switch (ndim) {
247     case 1:
248 #if defined(PETSC_USE_COMPLEX)
249       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
250 #else
251       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
252 #endif
253       break;
254     case 2:
255 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
256       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
257 #else
258       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
259 #endif
260       break;
261     case 3:
262 #if defined(PETSC_USE_COMPLEX)
263       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
264 #else
265       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
266 #endif
267       break;
268     default:
269 #if defined(PETSC_USE_COMPLEX)
270       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
271 #else
272       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
273 #endif
274       break;
275     }
276     fftw->finarray  = (PetscScalar *) x_array;
277     fftw->foutarray = y_array;
278     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
279                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
280     fftw_execute(fftw->p_forward);
281   } else { /* use existing plan */
282     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
283       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
284     } else {
285       fftw_execute(fftw->p_forward);
286     }
287   }
288   PetscCall(VecRestoreArray(y,&y_array));
289   PetscCall(VecRestoreArrayRead(x,&x_array));
290   PetscFunctionReturn(0);
291 }
292 
293 /*
294    MatMultTranspose_MPIFFTW performs parallel backward DFT
295    Input parameter:
296      A - the matrix
297      x - the vector on which BDFT will be performed
298 
299    Output parameter:
300      y - vector that stores result of BDFT
301 */
302 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
303 {
304   Mat_FFT        *fft  = (Mat_FFT*)A->data;
305   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
306   const PetscScalar *x_array;
307   PetscScalar    *y_array;
308   PetscInt       ndim = fft->ndim,*dim = fft->dim;
309   MPI_Comm       comm;
310 
311   PetscFunctionBegin;
312   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
313   PetscCall(VecGetArrayRead(x,&x_array));
314   PetscCall(VecGetArray(y,&y_array));
315   if (!fftw->p_backward) { /* create a plan, then execute it */
316     switch (ndim) {
317     case 1:
318 #if defined(PETSC_USE_COMPLEX)
319       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
320 #else
321       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
322 #endif
323       break;
324     case 2:
325 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
326       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
327 #else
328       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
329 #endif
330       break;
331     case 3:
332 #if defined(PETSC_USE_COMPLEX)
333       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
334 #else
335       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
336 #endif
337       break;
338     default:
339 #if defined(PETSC_USE_COMPLEX)
340       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
341 #else
342       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
343 #endif
344       break;
345     }
346     fftw->binarray  = (PetscScalar *) x_array;
347     fftw->boutarray = y_array;
348     fftw_execute(fftw->p_backward);
349   } else { /* use existing plan */
350     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
351       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
352     } else {
353       fftw_execute(fftw->p_backward);
354     }
355   }
356   PetscCall(VecRestoreArray(y,&y_array));
357   PetscCall(VecRestoreArrayRead(x,&x_array));
358   PetscFunctionReturn(0);
359 }
360 #endif
361 
362 PetscErrorCode MatDestroy_FFTW(Mat A)
363 {
364   Mat_FFT        *fft  = (Mat_FFT*)A->data;
365   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
366 
367   PetscFunctionBegin;
368   fftw_destroy_plan(fftw->p_forward);
369   fftw_destroy_plan(fftw->p_backward);
370   if (fftw->iodims) {
371     free(fftw->iodims);
372   }
373   PetscCall(PetscFree(fftw->dim_fftw));
374   PetscCall(PetscFree(fft->data));
375   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",NULL));
376   PetscCall(PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",NULL));
377   PetscCall(PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",NULL));
378 #if !PetscDefined(HAVE_MPIUNI)
379   fftw_mpi_cleanup();
380 #endif
381   PetscFunctionReturn(0);
382 }
383 
384 #if !PetscDefined(HAVE_MPIUNI)
385 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
386 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
387 {
388   PetscScalar    *array;
389 
390   PetscFunctionBegin;
391   PetscCall(VecGetArray(v,&array));
392   fftw_free((fftw_complex*)array);
393   PetscCall(VecRestoreArray(v,&array));
394   PetscCall(VecDestroy_MPI(v));
395   PetscFunctionReturn(0);
396 }
397 #endif
398 
399 #if !PetscDefined(HAVE_MPIUNI)
400 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin,Vec *fin_new)
401 {
402   Mat            A;
403 
404   PetscFunctionBegin;
405   PetscCall(PetscObjectQuery((PetscObject)fin,"FFTmatrix",(PetscObject*)&A));
406   PetscCall(MatCreateVecsFFTW_FFTW(A,fin_new,NULL,NULL));
407   PetscFunctionReturn(0);
408 }
409 
410 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout,Vec *fout_new)
411 {
412   Mat            A;
413 
414   PetscFunctionBegin;
415   PetscCall(PetscObjectQuery((PetscObject)fout,"FFTmatrix",(PetscObject*)&A));
416   PetscCall(MatCreateVecsFFTW_FFTW(A,NULL,fout_new,NULL));
417   PetscFunctionReturn(0);
418 }
419 
420 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
421 {
422   Mat            A;
423 
424   PetscFunctionBegin;
425   PetscCall(PetscObjectQuery((PetscObject)bout,"FFTmatrix",(PetscObject*)&A));
426   PetscCall(MatCreateVecsFFTW_FFTW(A,NULL,NULL,bout_new));
427   PetscFunctionReturn(0);
428 }
429 #endif
430 
/*@
   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
     parallel layout determined by FFTW

   Collective on Mat

   Input Parameter:
.   A - the matrix

   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW

  Level: advanced

  Note: The parallel layout of the output of forward FFTW is always the same as the input
        of the backward FFTW. But the parallel layout of the input vector of forward
        FFTW might not be the same as the output of backward FFTW.
        Also note that we need to provide enough space while doing a parallel real transform.
        We need to pad extra zeros at the end of the last dimension. For this reason one needs to
        invoke the routine fftw_mpi_local_size_transposed. Remember one has to change the
        last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
        depends on whether the last dimension is even or odd. If the last dimension is even one needs to pad two
        zeros; if it is odd only one zero is needed.
        Lastly one needs some scratch space at the end of the data set in each process. alloc_local
        figures out how much space is needed, i.e. it figures out the data+scratch space for
        each processor and returns that.

.seealso: `MatCreateFFT()`
@*/
PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscFunctionBegin;
  PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));
  PetscFunctionReturn(0);
}
468 
469 PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
470 {
471   PetscMPIInt    size,rank;
472   MPI_Comm       comm;
473   Mat_FFT        *fft  = (Mat_FFT*)A->data;
474 
475   PetscFunctionBegin;
476   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
477   PetscValidType(A,1);
478   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
479 
480   PetscCallMPI(MPI_Comm_size(comm, &size));
481   PetscCallMPI(MPI_Comm_rank(comm, &rank));
482   if (size == 1) { /* sequential case */
483 #if defined(PETSC_USE_COMPLEX)
484     if (fin)  PetscCall(VecCreateSeq(PETSC_COMM_SELF,fft->N,fin));
485     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF,fft->N,fout));
486     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF,fft->N,bout));
487 #else
488     if (fin) PetscCall(VecCreateSeq(PETSC_COMM_SELF,fft->n,fin));
489     if (fout) PetscCall(VecCreateSeq(PETSC_COMM_SELF,fft->n,fout));
490     if (bout) PetscCall(VecCreateSeq(PETSC_COMM_SELF,fft->n,bout));
491 #endif
492 #if !PetscDefined(HAVE_MPIUNI)
493   } else { /* parallel cases */
494     Mat_FFTW     *fftw = (Mat_FFTW*)fft->data;
495     PetscInt     ndim  = fft->ndim,*dim = fft->dim;
496     ptrdiff_t    alloc_local,local_n0,local_0_start;
497     ptrdiff_t    local_n1;
498     fftw_complex *data_fout;
499     ptrdiff_t    local_1_start;
500 #if defined(PETSC_USE_COMPLEX)
501     fftw_complex *data_fin,*data_bout;
502 #else
503     double       *data_finr,*data_boutr;
504     PetscInt     n1,N1;
505     ptrdiff_t    temp;
506 #endif
507 
508     switch (ndim) {
509     case 1:
510 #if !defined(PETSC_USE_COMPLEX)
511       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
512 #else
513       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
514       if (fin) {
515         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
516         PetscCall(VecCreateMPIWithArray(comm,1,local_n0,fft->N,(const PetscScalar*)data_fin,fin));
517         PetscCall(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
518         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
519         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
520       }
521       if (fout) {
522         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
523         PetscCall(VecCreateMPIWithArray(comm,1,local_n1,fft->N,(const PetscScalar*)data_fout,fout));
524         PetscCall(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
525         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
526         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
527       }
528       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
529       if (bout) {
530         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
531         PetscCall(VecCreateMPIWithArray(comm,1,local_n1,fft->N,(const PetscScalar*)data_bout,bout));
532         PetscCall(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
533         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
534         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
535       }
536       break;
537 #endif
538     case 2:
539 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
540       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
541       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
542       if (fin) {
543         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
544         PetscCall(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin));
545         PetscCall(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
546         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
547         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
548       }
549       if (fout) {
550         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
551         PetscCall(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout));
552         PetscCall(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
553         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
554         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
555       }
556       if (bout) {
557         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
558         PetscCall(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout));
559         PetscCall(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
560         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
561         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
562       }
563 #else
564       /* Get local size */
565       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
566       if (fin) {
567         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
568         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin));
569         PetscCall(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
570         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
571         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
572       }
573       if (fout) {
574         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
575         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout));
576         PetscCall(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
577         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
578         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
579       }
580       if (bout) {
581         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
582         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout));
583         PetscCall(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
584         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
585         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
586       }
587 #endif
588       break;
589     case 3:
590 #if !defined(PETSC_USE_COMPLEX)
591       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
592       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
593       if (fin) {
594         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
595         PetscCall(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin));
596         PetscCall(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
597         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
598         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
599       }
600       if (fout) {
601         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
602         PetscCall(VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout));
603         PetscCall(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
604         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
605         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
606       }
607       if (bout) {
608         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
609         PetscCall(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout));
610         PetscCall(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
611         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
612         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
613       }
614 #else
615       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
616       if (fin) {
617         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
618         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin));
619         PetscCall(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
620         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
621         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
622       }
623       if (fout) {
624         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
625         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout));
626         PetscCall(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
627         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
628         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
629       }
630       if (bout) {
631         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
632         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout));
633         PetscCall(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
634         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
635         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
636       }
637 #endif
638       break;
639     default:
640 #if !defined(PETSC_USE_COMPLEX)
641       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
642       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
643       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
644       N1          = 2*fft->N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
645       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
646 
647       if (fin) {
648         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
649         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_finr,fin));
650         PetscCall(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
651         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
652         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
653       }
654       if (fout) {
655         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
656         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_fout,fout));
657         PetscCall(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
658         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
659         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
660       }
661       if (bout) {
662         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
663         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_boutr,bout));
664         PetscCall(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
665         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
666         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
667       }
668 #else
669       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
670       if (fin) {
671         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
672         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin));
673         PetscCall(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
674         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
675         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
676       }
677       if (fout) {
678         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
679         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout));
680         PetscCall(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
681         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
682         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
683       }
684       if (bout) {
685         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
686         PetscCall(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout));
687         PetscCall(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
688         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
689         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
690       }
691 #endif
692       break;
693     }
694     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
695        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
696        memory leaks. We void these pointers here to avoid acident uses.
697     */
698     if (fin)  (*fin)->ops->replacearray = NULL;
699     if (fout) (*fout)->ops->replacearray = NULL;
700     if (bout) (*bout)->ops->replacearray = NULL;
701 #endif
702   }
703   PetscFunctionReturn(0);
704 }
705 
706 /*@
707    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
708 
709    Collective on Mat
710 
711    Input Parameters:
712 +  A - FFTW matrix
713 -  x - the PETSc vector
714 
715    Output Parameters:
716 .  y - the FFTW vector
717 
718   Options Database Keys:
719 . -mat_fftw_plannerflags - set FFTW planner flags
720 
721    Level: intermediate
722 
   Note: For real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
         one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
         zeros. This routine does that job by a scattering operation.
726 
727 .seealso: `VecScatterFFTWToPetsc()`
728 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation composed on A under the
     key "VecScatterPetscToFFTW_C" (registered in MatCreate_FFTW) */
  PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));
  PetscFunctionReturn(0);
}
735 
736 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
737 {
738   MPI_Comm       comm;
739   Mat_FFT        *fft  = (Mat_FFT*)A->data;
740   PetscInt       low;
741   PetscMPIInt    rank,size;
742   PetscInt       vsize,vsize1;
743   VecScatter     vecscat;
744   IS             list1;
745 
746   PetscFunctionBegin;
747   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
748   PetscCallMPI(MPI_Comm_size(comm, &size));
749   PetscCallMPI(MPI_Comm_rank(comm, &rank));
750   PetscCall(VecGetOwnershipRange(y,&low,NULL));
751 
752   if (size==1) {
753     PetscCall(VecGetSize(x,&vsize));
754     PetscCall(VecGetSize(y,&vsize1));
755     PetscCall(ISCreateStride(PETSC_COMM_SELF,fft->N,0,1,&list1));
756     PetscCall(VecScatterCreate(x,list1,y,list1,&vecscat));
757     PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
758     PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
759     PetscCall(VecScatterDestroy(&vecscat));
760     PetscCall(ISDestroy(&list1));
761 #if !PetscDefined(HAVE_MPIUNI)
762   } else {
763     Mat_FFTW   *fftw = (Mat_FFTW*)fft->data;
764     PetscInt   ndim  = fft->ndim,*dim = fft->dim;
765     ptrdiff_t  local_n0,local_0_start;
766     ptrdiff_t  local_n1,local_1_start;
767     IS         list2;
768 #if !defined(PETSC_USE_COMPLEX)
769     PetscInt   i,j,k,partial_dim;
770     PetscInt   *indx1, *indx2, tempindx, tempindx1;
771     PetscInt   NM;
772     ptrdiff_t  temp;
773 #endif
774 
775     switch (ndim) {
776     case 1:
777 #if defined(PETSC_USE_COMPLEX)
778       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
779 
780       PetscCall(ISCreateStride(comm,local_n0,local_0_start,1,&list1));
781       PetscCall(ISCreateStride(comm,local_n0,low,1,&list2));
782       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
783       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
784       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
785       PetscCall(VecScatterDestroy(&vecscat));
786       PetscCall(ISDestroy(&list1));
787       PetscCall(ISDestroy(&list2));
788 #else
789       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
790 #endif
791       break;
792     case 2:
793 #if defined(PETSC_USE_COMPLEX)
794       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
795 
796       PetscCall(ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1));
797       PetscCall(ISCreateStride(comm,local_n0*dim[1],low,1,&list2));
798       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
799       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
800       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
801       PetscCall(VecScatterDestroy(&vecscat));
802       PetscCall(ISDestroy(&list1));
803       PetscCall(ISDestroy(&list2));
804 #else
805       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
806 
807       PetscCall(PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1));
808       PetscCall(PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2));
809 
810       if (dim[1]%2==0) {
811         NM = dim[1]+2;
812       } else {
813         NM = dim[1]+1;
814       }
815       for (i=0; i<local_n0; i++) {
816         for (j=0; j<dim[1]; j++) {
817           tempindx  = i*dim[1] + j;
818           tempindx1 = i*NM + j;
819 
820           indx1[tempindx]=local_0_start*dim[1]+tempindx;
821           indx2[tempindx]=low+tempindx1;
822         }
823       }
824 
825       PetscCall(ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1));
826       PetscCall(ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2));
827 
828       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
829       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
830       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
831       PetscCall(VecScatterDestroy(&vecscat));
832       PetscCall(ISDestroy(&list1));
833       PetscCall(ISDestroy(&list2));
834       PetscCall(PetscFree(indx1));
835       PetscCall(PetscFree(indx2));
836 #endif
837       break;
838 
839     case 3:
840 #if defined(PETSC_USE_COMPLEX)
841       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
842 
843       PetscCall(ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1));
844       PetscCall(ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2));
845       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
846       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
847       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
848       PetscCall(VecScatterDestroy(&vecscat));
849       PetscCall(ISDestroy(&list1));
850       PetscCall(ISDestroy(&list2));
851 #else
852       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
853       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
854       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
855 
856       PetscCall(PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1));
857       PetscCall(PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2));
858 
859       if (dim[2]%2==0) NM = dim[2]+2;
860       else             NM = dim[2]+1;
861 
862       for (i=0; i<local_n0; i++) {
863         for (j=0; j<dim[1]; j++) {
864           for (k=0; k<dim[2]; k++) {
865             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
866             tempindx1 = i*dim[1]*NM + j*NM + k;
867 
868             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
869             indx2[tempindx]=low+tempindx1;
870           }
871         }
872       }
873 
874       PetscCall(ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1));
875       PetscCall(ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2));
876       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
877       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
878       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
879       PetscCall(VecScatterDestroy(&vecscat));
880       PetscCall(ISDestroy(&list1));
881       PetscCall(ISDestroy(&list2));
882       PetscCall(PetscFree(indx1));
883       PetscCall(PetscFree(indx2));
884 #endif
885       break;
886 
887     default:
888 #if defined(PETSC_USE_COMPLEX)
889       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
890 
891       PetscCall(ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1));
892       PetscCall(ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2));
893       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
894       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
895       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
896       PetscCall(VecScatterDestroy(&vecscat));
897       PetscCall(ISDestroy(&list1));
898       PetscCall(ISDestroy(&list2));
899 #else
900       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
901       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
902       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
903 
904       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
905 
906       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
907 
908       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
909 
910       partial_dim = fftw->partial_dim;
911 
912       PetscCall(PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1));
913       PetscCall(PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2));
914 
915       if (dim[ndim-1]%2==0) NM = 2;
916       else                  NM = 1;
917 
918       j = low;
919       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
920         indx1[i] = local_0_start*partial_dim + i;
921         indx2[i] = j;
922         if (k%dim[ndim-1]==0) j+=NM;
923         j++;
924       }
925       PetscCall(ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1));
926       PetscCall(ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2));
927       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
928       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
929       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
930       PetscCall(VecScatterDestroy(&vecscat));
931       PetscCall(ISDestroy(&list1));
932       PetscCall(ISDestroy(&list2));
933       PetscCall(PetscFree(indx1));
934       PetscCall(PetscFree(indx2));
935 #endif
936       break;
937     }
938 #endif
939   }
940   PetscFunctionReturn(0);
941 }
942 
943 /*@
944    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
945 
946    Collective on Mat
947 
948     Input Parameters:
949 +   A - FFTW matrix
950 -   x - FFTW vector
951 
952    Output Parameters:
953 .  y - PETSc vector
954 
955    Level: intermediate
956 
957    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
958          VecScatterFFTWToPetsc removes those extra zeros.
959 
960 .seealso: `VecScatterPetscToFFTW()`
961 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the type-specific implementation composed on A under the
     key "VecScatterFFTWToPetsc_C" (registered in MatCreate_FFTW) */
  PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));
  PetscFunctionReturn(0);
}
968 
969 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
970 {
971   MPI_Comm       comm;
972   Mat_FFT        *fft  = (Mat_FFT*)A->data;
973   PetscInt       low;
974   PetscMPIInt    rank,size;
975   VecScatter     vecscat;
976   IS             list1;
977 
978   PetscFunctionBegin;
979   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
980   PetscCallMPI(MPI_Comm_size(comm, &size));
981   PetscCallMPI(MPI_Comm_rank(comm, &rank));
982   PetscCall(VecGetOwnershipRange(x,&low,NULL));
983 
984   if (size==1) {
985     PetscCall(ISCreateStride(comm,fft->N,0,1,&list1));
986     PetscCall(VecScatterCreate(x,list1,y,list1,&vecscat));
987     PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
988     PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
989     PetscCall(VecScatterDestroy(&vecscat));
990     PetscCall(ISDestroy(&list1));
991 
992 #if !PetscDefined(HAVE_MPIUNI)
993   } else {
994     Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
995     PetscInt       ndim  = fft->ndim,*dim = fft->dim;
996     ptrdiff_t      local_n0,local_0_start;
997     ptrdiff_t      local_n1,local_1_start;
998     IS             list2;
999 #if !defined(PETSC_USE_COMPLEX)
1000     PetscInt       i,j,k,partial_dim;
1001     PetscInt       *indx1, *indx2, tempindx, tempindx1;
1002     PetscInt       NM;
1003     ptrdiff_t      temp;
1004 #endif
1005     switch (ndim) {
1006     case 1:
1007 #if defined(PETSC_USE_COMPLEX)
1008       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1009 
1010       PetscCall(ISCreateStride(comm,local_n1,local_1_start,1,&list1));
1011       PetscCall(ISCreateStride(comm,local_n1,low,1,&list2));
1012       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
1013       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1014       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1015       PetscCall(VecScatterDestroy(&vecscat));
1016       PetscCall(ISDestroy(&list1));
1017       PetscCall(ISDestroy(&list2));
1018 #else
1019       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
1020 #endif
1021       break;
1022     case 2:
1023 #if defined(PETSC_USE_COMPLEX)
1024       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1025 
1026       PetscCall(ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1));
1027       PetscCall(ISCreateStride(comm,local_n0*dim[1],low,1,&list2));
1028       PetscCall(VecScatterCreate(x,list2,y,list1,&vecscat));
1029       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1030       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1031       PetscCall(VecScatterDestroy(&vecscat));
1032       PetscCall(ISDestroy(&list1));
1033       PetscCall(ISDestroy(&list2));
1034 #else
1035       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1036 
1037       PetscCall(PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1));
1038       PetscCall(PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2));
1039 
1040       if (dim[1]%2==0) NM = dim[1]+2;
1041       else             NM = dim[1]+1;
1042 
1043       for (i=0; i<local_n0; i++) {
1044         for (j=0; j<dim[1]; j++) {
1045           tempindx = i*dim[1] + j;
1046           tempindx1 = i*NM + j;
1047 
1048           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1049           indx2[tempindx]=low+tempindx1;
1050         }
1051       }
1052 
1053       PetscCall(ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1));
1054       PetscCall(ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2));
1055 
1056       PetscCall(VecScatterCreate(x,list2,y,list1,&vecscat));
1057       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1058       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1059       PetscCall(VecScatterDestroy(&vecscat));
1060       PetscCall(ISDestroy(&list1));
1061       PetscCall(ISDestroy(&list2));
1062       PetscCall(PetscFree(indx1));
1063       PetscCall(PetscFree(indx2));
1064 #endif
1065       break;
1066     case 3:
1067 #if defined(PETSC_USE_COMPLEX)
1068       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1069 
1070       PetscCall(ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1));
1071       PetscCall(ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2));
1072       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
1073       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1074       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1075       PetscCall(VecScatterDestroy(&vecscat));
1076       PetscCall(ISDestroy(&list1));
1077       PetscCall(ISDestroy(&list2));
1078 #else
1079       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1080 
1081       PetscCall(PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1));
1082       PetscCall(PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2));
1083 
1084       if (dim[2]%2==0) NM = dim[2]+2;
1085       else             NM = dim[2]+1;
1086 
1087       for (i=0; i<local_n0; i++) {
1088         for (j=0; j<dim[1]; j++) {
1089           for (k=0; k<dim[2]; k++) {
1090             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1091             tempindx1 = i*dim[1]*NM + j*NM + k;
1092 
1093             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1094             indx2[tempindx]=low+tempindx1;
1095           }
1096         }
1097       }
1098 
1099       PetscCall(ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1));
1100       PetscCall(ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2));
1101 
1102       PetscCall(VecScatterCreate(x,list2,y,list1,&vecscat));
1103       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1104       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1105       PetscCall(VecScatterDestroy(&vecscat));
1106       PetscCall(ISDestroy(&list1));
1107       PetscCall(ISDestroy(&list2));
1108       PetscCall(PetscFree(indx1));
1109       PetscCall(PetscFree(indx2));
1110 #endif
1111       break;
1112     default:
1113 #if defined(PETSC_USE_COMPLEX)
1114       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1115 
1116       PetscCall(ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1));
1117       PetscCall(ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2));
1118       PetscCall(VecScatterCreate(x,list1,y,list2,&vecscat));
1119       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1120       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1121       PetscCall(VecScatterDestroy(&vecscat));
1122       PetscCall(ISDestroy(&list1));
1123       PetscCall(ISDestroy(&list2));
1124 #else
1125       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1126 
1127       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1128 
1129       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1130 
1131       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1132 
1133       partial_dim = fftw->partial_dim;
1134 
1135       PetscCall(PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1));
1136       PetscCall(PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2));
1137 
1138       if (dim[ndim-1]%2==0) NM = 2;
1139       else                  NM = 1;
1140 
1141       j = low;
1142       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1143         indx1[i] = local_0_start*partial_dim + i;
1144         indx2[i] = j;
1145         if (k%dim[ndim-1]==0) j+=NM;
1146         j++;
1147       }
1148       PetscCall(ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1));
1149       PetscCall(ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2));
1150 
1151       PetscCall(VecScatterCreate(x,list2,y,list1,&vecscat));
1152       PetscCall(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1153       PetscCall(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1154       PetscCall(VecScatterDestroy(&vecscat));
1155       PetscCall(ISDestroy(&list1));
1156       PetscCall(ISDestroy(&list2));
1157       PetscCall(PetscFree(indx1));
1158       PetscCall(PetscFree(indx2));
1159 #endif
1160       break;
1161     }
1162 #endif
1163   }
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 /*
1168     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1169 
1170   Options Database Keys:
1171 + -mat_fftw_plannerflags - set FFTW planner flags
1172 
1173    Level: intermediate
1174 
1175 */
1176 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1177 {
1178   MPI_Comm       comm;
1179   Mat_FFT        *fft = (Mat_FFT*)A->data;
1180   Mat_FFTW       *fftw;
1181   PetscInt       ndim = fft->ndim,*dim = fft->dim;
1182   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1183   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1184   PetscBool      flg;
1185   PetscInt       p_flag,partial_dim=1,ctr;
1186   PetscMPIInt    size,rank;
1187   ptrdiff_t      *pdim;
1188 #if !defined(PETSC_USE_COMPLEX)
1189   PetscInt       tot_dim;
1190 #endif
1191 
1192   PetscFunctionBegin;
1193   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
1194   PetscCallMPI(MPI_Comm_size(comm, &size));
1195   PetscCallMPI(MPI_Comm_rank(comm, &rank));
1196 
1197 #if !PetscDefined(HAVE_MPIUNI)
1198   fftw_mpi_init();
1199 #endif
1200   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1201   pdim[0] = dim[0];
1202 #if !defined(PETSC_USE_COMPLEX)
1203   tot_dim = 2*dim[0];
1204 #endif
1205   for (ctr=1; ctr<ndim; ctr++) {
1206     partial_dim *= dim[ctr];
1207     pdim[ctr]    = dim[ctr];
1208 #if !defined(PETSC_USE_COMPLEX)
1209     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1210     else             tot_dim *= dim[ctr];
1211 #endif
1212   }
1213 
1214   if (size == 1) {
1215 #if defined(PETSC_USE_COMPLEX)
1216     PetscCall(MatSetSizes(A,fft->N,fft->N,fft->N,fft->N));
1217     fft->n = fft->N;
1218 #else
1219     PetscCall(MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim));
1220     fft->n = tot_dim;
1221 #endif
1222 #if !PetscDefined(HAVE_MPIUNI)
1223   } else {
1224     ptrdiff_t local_n0,local_0_start,local_n1,local_1_start;
1225 #if !defined(PETSC_USE_COMPLEX)
1226     ptrdiff_t temp;
1227     PetscInt  N1;
1228 #endif
1229 
1230     switch (ndim) {
1231     case 1:
1232 #if !defined(PETSC_USE_COMPLEX)
1233       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1234 #else
1235       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1236       fft->n = (PetscInt)local_n0;
1237       PetscCall(MatSetSizes(A,local_n1,fft->n,fft->N,fft->N));
1238 #endif
1239       break;
1240     case 2:
1241 #if defined(PETSC_USE_COMPLEX)
1242       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1243       fft->n    = (PetscInt)local_n0*dim[1];
1244       PetscCall(MatSetSizes(A,fft->n,fft->n,fft->N,fft->N));
1245 #else
1246       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1247 
1248       fft->n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1249       PetscCall(MatSetSizes(A,fft->n,fft->n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1)));
1250 #endif
1251       break;
1252     case 3:
1253 #if defined(PETSC_USE_COMPLEX)
1254       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1255 
1256       fft->n = (PetscInt)local_n0*dim[1]*dim[2];
1257       PetscCall(MatSetSizes(A,fft->n,fft->n,fft->N,fft->N));
1258 #else
1259       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1260 
1261       fft->n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1262       PetscCall(MatSetSizes(A,fft->n,fft->n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1)));
1263 #endif
1264       break;
1265     default:
1266 #if defined(PETSC_USE_COMPLEX)
1267       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1268 
1269       fft->n = (PetscInt)local_n0*partial_dim;
1270       PetscCall(MatSetSizes(A,fft->n,fft->n,fft->N,fft->N));
1271 #else
1272       temp = pdim[ndim-1];
1273 
1274       pdim[ndim-1] = temp/2 + 1;
1275 
1276       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1277 
1278       fft->n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1279       N1 = 2*fft->N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1280 
1281       pdim[ndim-1] = temp;
1282 
1283       PetscCall(MatSetSizes(A,fft->n,fft->n,N1,N1));
1284 #endif
1285       break;
1286     }
1287 #endif
1288   }
1289   free(pdim);
1290   PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATFFTW));
1291   PetscCall(PetscNewLog(A,&fftw));
1292   fft->data = (void*)fftw;
1293 
1294   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1295   fftw->partial_dim = partial_dim;
1296 
1297   PetscCall(PetscMalloc1(ndim, &(fftw->dim_fftw)));
1298   if (size == 1) {
1299 #if defined(PETSC_USE_64BIT_INDICES)
1300     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1301 #else
1302     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1303 #endif
1304   }
1305 
1306   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1307 
1308   fftw->p_forward  = NULL;
1309   fftw->p_backward = NULL;
1310   fftw->p_flag     = FFTW_ESTIMATE;
1311 
1312   if (size == 1) {
1313     A->ops->mult          = MatMult_SeqFFTW;
1314     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1315 #if !PetscDefined(HAVE_MPIUNI)
1316   } else {
1317     A->ops->mult          = MatMult_MPIFFTW;
1318     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1319 #endif
1320   }
1321   fft->matdestroy = MatDestroy_FFTW;
1322   A->assembled    = PETSC_TRUE;
1323   A->preallocated = PETSC_TRUE;
1324 
1325   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW));
1326   PetscCall(PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW));
1327   PetscCall(PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW));
1328 
1329   /* get runtime options */
1330   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");
1331   PetscCall(PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg));
1332   if (flg) fftw->p_flag = iplans[p_flag];
1333   PetscOptionsEnd();
1334   PetscFunctionReturn(0);
1335 }
1336