xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 40badf4fbc550ac1f60bd080eaff6de6d55b946d)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #if !PetscDefined(HAVE_MPIUNI)
10 #include <fftw3-mpi.h>
11 #else
12 #include <fftw3.h>
13 #endif
14 EXTERN_C_END
15 
typedef struct {
  ptrdiff_t    ndim_fftw,*dim_fftw; /* dimensions in FFTW's ptrdiff_t form, used by the guru and MPI planner interfaces */
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64 *iodims;            /* per-dimension size/stride descriptors for the 64-bit guru planner */
#else
  fftw_iodim   *iodims;            /* per-dimension size/stride descriptors for the guru planner */
#endif
  PetscInt     partial_dim;
  fftw_plan    p_forward,p_backward; /* cached plans, created lazily on first MatMult/MatMultTranspose */
  unsigned     p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
  PetscScalar  *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays because an fftw plan should be
                                                             executed for the arrays with which the plan was created */
} Mat_FFTW;
29 
30 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
31 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
32 extern PetscErrorCode MatDestroy_FFTW(Mat);
33 extern PetscErrorCode MatCreateVecsFFTW_FFTW(Mat,Vec*,Vec*,Vec*);
34 #if !PetscDefined(HAVE_MPIUNI)
35 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
36 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
37 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
38 #endif
39 
/*
   MatMult_SeqFFTW performs the sequential forward DFT y = FFT(x) via FFTW.

   Input parameters:
     A - the FFT matrix (Mat_FFT whose implementation data is a Mat_FFTW)
     x - the vector on which the forward DFT will be performed

   Output parameter:
     y - vector that stores the result of the forward DFT
*/
PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims;
#else
  fftw_iodim     *iodims;
#endif
  PetscInt       i;
#endif
  PetscInt       ndim = fft->ndim,*dim = fft->dim;

  PetscFunctionBegin;
  CHKERRQ(VecGetArrayRead(x,&x_array));
  CHKERRQ(VecGetArray(y,&y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    default:
      /* ndim > 3 (or 0): use FFTW's guru interface. The iodims are filled for a
         row-major (C-order) layout: the last dimension is contiguous (stride 1)
         and each earlier dimension's stride is the product of the sizes of the
         dimensions after it. */
#if defined(PETSC_USE_COMPLEX)
      iodims = fftw->iodims;
#if defined(PETSC_USE_64BIT_INDICES)
      if (ndim) {
        iodims[ndim-1].n = (ptrdiff_t)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (ptrdiff_t)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#else
      if (ndim) {
        iodims[ndim-1].n = (int)dim[ndim-1];
        iodims[ndim-1].is = iodims[ndim-1].os = 1;
        for (i=ndim-2; i>=0; --i) {
          iodims[i].n = (int)dim[i];
          iodims[i].is = iodims[i].os = iodims[i+1].is * iodims[i+1].n;
        }
      }
      fftw->p_forward = fftw_plan_guru_dft((int)ndim,(fftw_iodim*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
#endif

#else
      fftw->p_forward = fftw_plan_dft_r2c(ndim,(int*)dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
#endif
      break;
    }
    /* remember the arrays the plan was created for; plain fftw_execute() is only
       valid on exactly these arrays */
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten
                during planning, so planning should be done before x is initialized!
                See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
      /* new-array execute; FFTW requires the new arrays to have the same layout and
         alignment as those used at planning time -- assumed to hold here */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_r2c(fftw->p_forward,(double*)x_array,(fftw_complex*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  CHKERRQ(VecRestoreArray(y,&y_array));
  CHKERRQ(VecRestoreArrayRead(x,&x_array));
  PetscFunctionReturn(0);
}
141 
/* MatMultTranspose_SeqFFTW performs the sequential backward (inverse, unnormalized) DFT
   Input parameters:
     A - the FFT matrix
     x - the vector on which the backward DFT will be performed

   Output parameter:
     y - vector that stores the result of the backward DFT
*/

PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
  PetscInt       ndim = fft->ndim,*dim = fft->dim;
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
  fftw_iodim64   *iodims = fftw->iodims;
#else
  fftw_iodim     *iodims = fftw->iodims;
#endif
#endif

  PetscFunctionBegin;
  CHKERRQ(VecGetArrayRead(x,&x_array));
  CHKERRQ(VecGetArray(y,&y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    default:
      /* NOTE(review): in the complex guru path the iodims descriptors are only
         filled in by MatMult_SeqFFTW's planning code; this appears to assume the
         forward plan is created before the backward one when ndim > 3 -- confirm. */
#if defined(PETSC_USE_COMPLEX)
#if defined(PETSC_USE_64BIT_INDICES)
      fftw->p_backward = fftw_plan_guru64_dft((int)ndim,(fftw_iodim64*)iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_plan_guru_dft((int)ndim,iodims,0,NULL,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
#endif
#else
      fftw->p_backward= fftw_plan_dft_c2r((int)ndim,(int*)dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
#endif
      break;
    }
    /* remember the arrays the plan was created for (see MatMult_SeqFFTW) */
    fftw->binarray  = (PetscScalar *) x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else { /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
#if defined(PETSC_USE_COMPLEX)
      fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
#else
      fftw_execute_dft_c2r(fftw->p_backward,(fftw_complex*)x_array,(double*)y_array);
#endif
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  CHKERRQ(VecRestoreArray(y,&y_array));
  CHKERRQ(VecRestoreArrayRead(x,&x_array));
  PetscFunctionReturn(0);
}
222 
#if !PetscDefined(HAVE_MPIUNI)
/* MatMult_MPIFFTW performs the forward DFT in parallel via FFTW's MPI interface
   Input parameters:
     A - the FFT matrix
     x - the vector on which the forward DFT will be performed

   Output parameter:
     y - vector that stores the result of the forward DFT
*/
PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
  PetscInt       ndim = fft->ndim,*dim = fft->dim;
  MPI_Comm       comm;

  PetscFunctionBegin;
  CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
  CHKERRQ(VecGetArrayRead(x,&x_array));
  CHKERRQ(VecGetArray(y,&y_array));
  if (!fftw->p_forward) { /* create a plan, then execute it */
    /* NOTE(review): the real (r2c) branches below hard-code FFTW_ESTIMATE instead
       of fftw->p_flag, unlike the complex branches -- confirm whether intentional. */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
      fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
#else
      fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    }
    /* remember the arrays the plan was created for; plain fftw_execute() is only
       valid on exactly these arrays */
    fftw->finarray  = (PetscScalar *) x_array;
    fftw->foutarray = y_array;
    /* Warning: if (fftw->p_flag != FFTW_ESTIMATE) the data in the in/out arrays is overwritten
                during planning, so planning should be done before x is initialized!
                See FFTW manual sec2.1 or sec4 */
    fftw_execute(fftw->p_forward);
  } else { /* use existing plan */
    if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
      fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
    } else {
      fftw_execute(fftw->p_forward);
    }
  }
  CHKERRQ(VecRestoreArray(y,&y_array));
  CHKERRQ(VecRestoreArrayRead(x,&x_array));
  PetscFunctionReturn(0);
}
292 
/*
   MatMultTranspose_MPIFFTW performs the parallel backward (inverse, unnormalized) DFT
   Input parameters:
     A - the FFT matrix
     x - the vector on which the backward DFT will be performed

   Output parameter:
     y - vector that stores the result of the backward DFT
*/
PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
{
  Mat_FFT        *fft  = (Mat_FFT*)A->data;
  Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
  const PetscScalar *x_array;
  PetscScalar    *y_array;
  PetscInt       ndim = fft->ndim,*dim = fft->dim;
  MPI_Comm       comm;

  PetscFunctionBegin;
  CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
  CHKERRQ(VecGetArrayRead(x,&x_array));
  CHKERRQ(VecGetArray(y,&y_array));
  if (!fftw->p_backward) { /* create a plan, then execute it */
    /* NOTE(review): the real (c2r) branches below hard-code FFTW_ESTIMATE instead
       of fftw->p_flag, unlike the complex branches -- confirm whether intentional. */
    switch (ndim) {
    case 1:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
#endif
      break;
    case 2:
#if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
      fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    case 3:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    default:
#if defined(PETSC_USE_COMPLEX)
      fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
#else
      fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
#endif
      break;
    }
    /* remember the arrays the plan was created for (see MatMult_MPIFFTW) */
    fftw->binarray  = (PetscScalar *) x_array;
    fftw->boutarray = y_array;
    fftw_execute(fftw->p_backward);
  } else { /* use existing plan */
    if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
      fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
    } else {
      fftw_execute(fftw->p_backward);
    }
  }
  CHKERRQ(VecRestoreArray(y,&y_array));
  CHKERRQ(VecRestoreArrayRead(x,&x_array));
  PetscFunctionReturn(0);
}
360 #endif
361 
362 PetscErrorCode MatDestroy_FFTW(Mat A)
363 {
364   Mat_FFT        *fft  = (Mat_FFT*)A->data;
365   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
366 
367   PetscFunctionBegin;
368   fftw_destroy_plan(fftw->p_forward);
369   fftw_destroy_plan(fftw->p_backward);
370   if (fftw->iodims) {
371     free(fftw->iodims);
372   }
373   CHKERRQ(PetscFree(fftw->dim_fftw));
374   CHKERRQ(PetscFree(fft->data));
375 #if !PetscDefined(HAVE_MPIUNI)
376   fftw_mpi_cleanup();
377 #endif
378   PetscFunctionReturn(0);
379 }
380 
381 #if !PetscDefined(HAVE_MPIUNI)
382 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
383 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
384 {
385   PetscScalar    *array;
386 
387   PetscFunctionBegin;
388   CHKERRQ(VecGetArray(v,&array));
389   fftw_free((fftw_complex*)array);
390   CHKERRQ(VecRestoreArray(v,&array));
391   CHKERRQ(VecDestroy_MPI(v));
392   PetscFunctionReturn(0);
393 }
394 #endif
395 
396 #if !PetscDefined(HAVE_MPIUNI)
397 static PetscErrorCode VecDuplicate_FFTW_fin(Vec fin,Vec *fin_new)
398 {
399   Mat            A;
400 
401   PetscFunctionBegin;
402   CHKERRQ(PetscObjectQuery((PetscObject)fin,"FFTmatrix",(PetscObject*)&A));
403   CHKERRQ(MatCreateVecsFFTW_FFTW(A,fin_new,NULL,NULL));
404   PetscFunctionReturn(0);
405 }
406 
407 static PetscErrorCode VecDuplicate_FFTW_fout(Vec fout,Vec *fout_new)
408 {
409   Mat            A;
410 
411   PetscFunctionBegin;
412   CHKERRQ(PetscObjectQuery((PetscObject)fout,"FFTmatrix",(PetscObject*)&A));
413   CHKERRQ(MatCreateVecsFFTW_FFTW(A,NULL,fout_new,NULL));
414   PetscFunctionReturn(0);
415 }
416 
417 static PetscErrorCode VecDuplicate_FFTW_bout(Vec bout, Vec *bout_new)
418 {
419   Mat            A;
420 
421   PetscFunctionBegin;
422   CHKERRQ(PetscObjectQuery((PetscObject)bout,"FFTmatrix",(PetscObject*)&A));
423   CHKERRQ(MatCreateVecsFFTW_FFTW(A,NULL,NULL,bout_new));
424   PetscFunctionReturn(0);
425 }
426 #endif
427 
/*@
   MatCreateVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
     parallel layout determined by FFTW

   Collective on Mat

   Input Parameter:
.   A - the matrix

   Output Parameters:
+   x - (optional) input vector of forward FFTW
.   y - (optional) output vector of forward FFTW
-   z - (optional) output vector of backward FFTW

  Level: advanced

  Note: The parallel layout of the output of forward FFTW is always the same as the input
        of the backward FFTW. But the parallel layout of the input vector of forward
        FFTW might not be the same as the output of backward FFTW.
        Also note that we need to provide enough space while doing a parallel real transform.
        We need to pad extra zeros at the end of the last dimension. For this reason one needs to
        invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
        last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
        depends on whether the last dimension is even or odd. If the last dimension is even one needs to pad two
        zeros; if it is odd only one zero is needed.
        Lastly one needs some scratch space at the end of the data set in each process. alloc_local
        figures out how much space is needed, i.e. it figures out the data+scratch space for
        each processor and returns that.

.seealso: MatCreateFFT()
@*/
PetscErrorCode MatCreateVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
{
  PetscFunctionBegin;
  CHKERRQ(PetscUseMethod(A,"MatCreateVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z)));
  PetscFunctionReturn(0);
}
465 
466 PetscErrorCode  MatCreateVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
467 {
468   PetscMPIInt    size,rank;
469   MPI_Comm       comm;
470   Mat_FFT        *fft  = (Mat_FFT*)A->data;
471 
472   PetscFunctionBegin;
473   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
474   PetscValidType(A,1);
475   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
476 
477   CHKERRMPI(MPI_Comm_size(comm, &size));
478   CHKERRMPI(MPI_Comm_rank(comm, &rank));
479   if (size == 1) { /* sequential case */
480 #if defined(PETSC_USE_COMPLEX)
481     if (fin)  CHKERRQ(VecCreateSeq(PETSC_COMM_SELF,fft->N,fin));
482     if (fout) CHKERRQ(VecCreateSeq(PETSC_COMM_SELF,fft->N,fout));
483     if (bout) CHKERRQ(VecCreateSeq(PETSC_COMM_SELF,fft->N,bout));
484 #else
485     if (fin) CHKERRQ(VecCreateSeq(PETSC_COMM_SELF,fft->n,fin));
486     if (fout) CHKERRQ(VecCreateSeq(PETSC_COMM_SELF,fft->n,fout));
487     if (bout) CHKERRQ(VecCreateSeq(PETSC_COMM_SELF,fft->n,bout));
488 #endif
489 #if !PetscDefined(HAVE_MPIUNI)
490   } else { /* parallel cases */
491     Mat_FFTW     *fftw = (Mat_FFTW*)fft->data;
492     PetscInt     ndim  = fft->ndim,*dim = fft->dim;
493     ptrdiff_t    alloc_local,local_n0,local_0_start;
494     ptrdiff_t    local_n1;
495     fftw_complex *data_fout;
496     ptrdiff_t    local_1_start;
497 #if defined(PETSC_USE_COMPLEX)
498     fftw_complex *data_fin,*data_bout;
499 #else
500     double       *data_finr,*data_boutr;
501     PetscInt     n1,N1;
502     ptrdiff_t    temp;
503 #endif
504 
505     switch (ndim) {
506     case 1:
507 #if !defined(PETSC_USE_COMPLEX)
508       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
509 #else
510       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
511       if (fin) {
512         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
513         CHKERRQ(VecCreateMPIWithArray(comm,1,local_n0,fft->N,(const PetscScalar*)data_fin,fin));
514         CHKERRQ(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
515         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
516         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
517       }
518       if (fout) {
519         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
520         CHKERRQ(VecCreateMPIWithArray(comm,1,local_n1,fft->N,(const PetscScalar*)data_fout,fout));
521         CHKERRQ(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
522         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
523         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
524       }
525       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
526       if (bout) {
527         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
528         CHKERRQ(VecCreateMPIWithArray(comm,1,local_n1,fft->N,(const PetscScalar*)data_bout,bout));
529         CHKERRQ(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
530         (*bout)->ops->duplicate = VecDuplicate_FFTW_fout;
531         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
532       }
533       break;
534 #endif
535     case 2:
536 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
537       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
538       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
539       if (fin) {
540         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
541         CHKERRQ(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin));
542         CHKERRQ(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
543         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
544         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
545       }
546       if (fout) {
547         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
548         CHKERRQ(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout));
549         CHKERRQ(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
550         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
551         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
552       }
553       if (bout) {
554         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
555         CHKERRQ(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout));
556         CHKERRQ(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
557         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
558         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
559       }
560 #else
561       /* Get local size */
562       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
563       if (fin) {
564         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
565         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin));
566         CHKERRQ(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
567         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
568         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
569       }
570       if (fout) {
571         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
572         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout));
573         CHKERRQ(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
574         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
575         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
576       }
577       if (bout) {
578         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
579         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout));
580         CHKERRQ(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
581         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
582         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
583       }
584 #endif
585       break;
586     case 3:
587 #if !defined(PETSC_USE_COMPLEX)
588       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
589       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
590       if (fin) {
591         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
592         CHKERRQ(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin));
593         CHKERRQ(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
594         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
595         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
596       }
597       if (fout) {
598         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
599         CHKERRQ(VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout));
600         CHKERRQ(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
601         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
602         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
603       }
604       if (bout) {
605         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
606         CHKERRQ(VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout));
607         CHKERRQ(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
608         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
609         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
610       }
611 #else
612       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
613       if (fin) {
614         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
615         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin));
616         CHKERRQ(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
617         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
618         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
619       }
620       if (fout) {
621         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
622         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout));
623         CHKERRQ(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
624         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
625         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
626       }
627       if (bout) {
628         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
629         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout));
630         CHKERRQ(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
631         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
632         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
633       }
634 #endif
635       break;
636     default:
637 #if !defined(PETSC_USE_COMPLEX)
638       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
639       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
640       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
641       N1          = 2*fft->N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
642       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
643 
644       if (fin) {
645         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
646         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_finr,fin));
647         CHKERRQ(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
648         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
649         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
650       }
651       if (fout) {
652         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
653         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_fout,fout));
654         CHKERRQ(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
655         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
656         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
657       }
658       if (bout) {
659         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
660         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,N1,(PetscScalar*)data_boutr,bout));
661         CHKERRQ(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
662         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
663         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
664       }
665 #else
666       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
667       if (fin) {
668         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
669         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fin,fin));
670         CHKERRQ(PetscObjectCompose((PetscObject)*fin,"FFTmatrix",(PetscObject)A));
671         (*fin)->ops->duplicate = VecDuplicate_FFTW_fin;
672         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
673       }
674       if (fout) {
675         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
676         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_fout,fout));
677         CHKERRQ(PetscObjectCompose((PetscObject)*fout,"FFTmatrix",(PetscObject)A));
678         (*fout)->ops->duplicate = VecDuplicate_FFTW_fout;
679         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
680       }
681       if (bout) {
682         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
683         CHKERRQ(VecCreateMPIWithArray(comm,1,fft->n,fft->N,(const PetscScalar*)data_bout,bout));
684         CHKERRQ(PetscObjectCompose((PetscObject)*bout,"FFTmatrix",(PetscObject)A));
685         (*bout)->ops->duplicate = VecDuplicate_FFTW_bout;
686         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
687       }
688 #endif
689       break;
690     }
691     /* fftw vectors have their data array allocated by fftw_malloc, such that v->array=xxx but
692        v->array_allocated=NULL. A regular replacearray call won't free the memory and only causes
693        memory leaks. We void these pointers here to avoid acident uses.
694     */
695     if (fin)  (*fin)->ops->replacearray = NULL;
696     if (fout) (*fout)->ops->replacearray = NULL;
697     if (bout) (*bout)->ops->replacearray = NULL;
698 #endif
699   }
700   PetscFunctionReturn(0);
701 }
702 
703 /*@
704    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
705 
706    Collective on Mat
707 
708    Input Parameters:
709 +  A - FFTW matrix
710 -  x - the PETSc vector
711 
712    Output Parameters:
713 .  y - the FFTW vector
714 
715   Options Database Keys:
716 . -mat_fftw_plannerflags - set FFTW planner flags
717 
718    Level: intermediate
719 
   Note: For real parallel FFT, FFTW requires insertion of extra space at the end of the last dimension. This is required even when
         one is not doing an in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
         zeros. This routine does that job via a scatter operation.
723 
724 .seealso: VecScatterFFTWToPetsc()
725 @*/
PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under the key
     "VecScatterPetscToFFTW_C" (set to VecScatterPetscToFFTW_FFTW by MatCreate_FFTW) */
  CHKERRQ(PetscUseMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y)));
  PetscFunctionReturn(0);
}
732 
733 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
734 {
735   MPI_Comm       comm;
736   Mat_FFT        *fft  = (Mat_FFT*)A->data;
737   PetscInt       low;
738   PetscMPIInt    rank,size;
739   PetscInt       vsize,vsize1;
740   VecScatter     vecscat;
741   IS             list1;
742 
743   PetscFunctionBegin;
744   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
745   CHKERRMPI(MPI_Comm_size(comm, &size));
746   CHKERRMPI(MPI_Comm_rank(comm, &rank));
747   CHKERRQ(VecGetOwnershipRange(y,&low,NULL));
748 
749   if (size==1) {
750     CHKERRQ(VecGetSize(x,&vsize));
751     CHKERRQ(VecGetSize(y,&vsize1));
752     CHKERRQ(ISCreateStride(PETSC_COMM_SELF,fft->N,0,1,&list1));
753     CHKERRQ(VecScatterCreate(x,list1,y,list1,&vecscat));
754     CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
755     CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
756     CHKERRQ(VecScatterDestroy(&vecscat));
757     CHKERRQ(ISDestroy(&list1));
758 #if !PetscDefined(HAVE_MPIUNI)
759   } else {
760     Mat_FFTW   *fftw = (Mat_FFTW*)fft->data;
761     PetscInt   ndim  = fft->ndim,*dim = fft->dim;
762     ptrdiff_t  local_n0,local_0_start;
763     ptrdiff_t  local_n1,local_1_start;
764     IS         list2;
765 #if !defined(PETSC_USE_COMPLEX)
766     PetscInt   i,j,k,partial_dim;
767     PetscInt   *indx1, *indx2, tempindx, tempindx1;
768     PetscInt   NM;
769     ptrdiff_t  temp;
770 #endif
771 
772     switch (ndim) {
773     case 1:
774 #if defined(PETSC_USE_COMPLEX)
775       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
776 
777       CHKERRQ(ISCreateStride(comm,local_n0,local_0_start,1,&list1));
778       CHKERRQ(ISCreateStride(comm,local_n0,low,1,&list2));
779       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
780       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
781       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
782       CHKERRQ(VecScatterDestroy(&vecscat));
783       CHKERRQ(ISDestroy(&list1));
784       CHKERRQ(ISDestroy(&list2));
785 #else
786       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
787 #endif
788       break;
789     case 2:
790 #if defined(PETSC_USE_COMPLEX)
791       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
792 
793       CHKERRQ(ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1));
794       CHKERRQ(ISCreateStride(comm,local_n0*dim[1],low,1,&list2));
795       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
796       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
797       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
798       CHKERRQ(VecScatterDestroy(&vecscat));
799       CHKERRQ(ISDestroy(&list1));
800       CHKERRQ(ISDestroy(&list2));
801 #else
802       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
803 
804       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1));
805       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2));
806 
807       if (dim[1]%2==0) {
808         NM = dim[1]+2;
809       } else {
810         NM = dim[1]+1;
811       }
812       for (i=0; i<local_n0; i++) {
813         for (j=0; j<dim[1]; j++) {
814           tempindx  = i*dim[1] + j;
815           tempindx1 = i*NM + j;
816 
817           indx1[tempindx]=local_0_start*dim[1]+tempindx;
818           indx2[tempindx]=low+tempindx1;
819         }
820       }
821 
822       CHKERRQ(ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1));
823       CHKERRQ(ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2));
824 
825       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
826       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
827       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
828       CHKERRQ(VecScatterDestroy(&vecscat));
829       CHKERRQ(ISDestroy(&list1));
830       CHKERRQ(ISDestroy(&list2));
831       CHKERRQ(PetscFree(indx1));
832       CHKERRQ(PetscFree(indx2));
833 #endif
834       break;
835 
836     case 3:
837 #if defined(PETSC_USE_COMPLEX)
838       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
839 
840       CHKERRQ(ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1));
841       CHKERRQ(ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2));
842       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
843       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
844       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
845       CHKERRQ(VecScatterDestroy(&vecscat));
846       CHKERRQ(ISDestroy(&list1));
847       CHKERRQ(ISDestroy(&list2));
848 #else
849       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
850       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 3D real transform");
851       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
852 
853       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1));
854       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2));
855 
856       if (dim[2]%2==0) NM = dim[2]+2;
857       else             NM = dim[2]+1;
858 
859       for (i=0; i<local_n0; i++) {
860         for (j=0; j<dim[1]; j++) {
861           for (k=0; k<dim[2]; k++) {
862             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
863             tempindx1 = i*dim[1]*NM + j*NM + k;
864 
865             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
866             indx2[tempindx]=low+tempindx1;
867           }
868         }
869       }
870 
871       CHKERRQ(ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1));
872       CHKERRQ(ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2));
873       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
874       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
875       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
876       CHKERRQ(VecScatterDestroy(&vecscat));
877       CHKERRQ(ISDestroy(&list1));
878       CHKERRQ(ISDestroy(&list2));
879       CHKERRQ(PetscFree(indx1));
880       CHKERRQ(PetscFree(indx2));
881 #endif
882       break;
883 
884     default:
885 #if defined(PETSC_USE_COMPLEX)
886       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
887 
888       CHKERRQ(ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1));
889       CHKERRQ(ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2));
890       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
891       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
892       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
893       CHKERRQ(VecScatterDestroy(&vecscat));
894       CHKERRQ(ISDestroy(&list1));
895       CHKERRQ(ISDestroy(&list2));
896 #else
897       /* buggy, needs to be fixed. See src/mat/tests/ex158.c */
898       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel DIM>3 real transform");
899       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
900 
901       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
902 
903       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
904 
905       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
906 
907       partial_dim = fftw->partial_dim;
908 
909       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1));
910       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2));
911 
912       if (dim[ndim-1]%2==0) NM = 2;
913       else                  NM = 1;
914 
915       j = low;
916       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
917         indx1[i] = local_0_start*partial_dim + i;
918         indx2[i] = j;
919         if (k%dim[ndim-1]==0) j+=NM;
920         j++;
921       }
922       CHKERRQ(ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1));
923       CHKERRQ(ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2));
924       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
925       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
926       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
927       CHKERRQ(VecScatterDestroy(&vecscat));
928       CHKERRQ(ISDestroy(&list1));
929       CHKERRQ(ISDestroy(&list2));
930       CHKERRQ(PetscFree(indx1));
931       CHKERRQ(PetscFree(indx2));
932 #endif
933       break;
934     }
935 #endif
936   }
937   PetscFunctionReturn(0);
938 }
939 
940 /*@
941    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
942 
943    Collective on Mat
944 
945     Input Parameters:
946 +   A - FFTW matrix
947 -   x - FFTW vector
948 
949    Output Parameters:
950 .  y - PETSc vector
951 
952    Level: intermediate
953 
   Note: When performing a real transform, the FFTW output of the backward DFT contains extra zeros at the end of the last dimension.
         VecScatterFFTWToPetsc removes those extra zeros.
956 
957 .seealso: VecScatterPetscToFFTW()
958 @*/
PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation composed on A under the key
     "VecScatterFFTWToPetsc_C" (set to VecScatterFFTWToPetsc_FFTW by MatCreate_FFTW) */
  CHKERRQ(PetscUseMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y)));
  PetscFunctionReturn(0);
}
965 
966 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
967 {
968   MPI_Comm       comm;
969   Mat_FFT        *fft  = (Mat_FFT*)A->data;
970   PetscInt       low;
971   PetscMPIInt    rank,size;
972   VecScatter     vecscat;
973   IS             list1;
974 
975   PetscFunctionBegin;
976   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
977   CHKERRMPI(MPI_Comm_size(comm, &size));
978   CHKERRMPI(MPI_Comm_rank(comm, &rank));
979   CHKERRQ(VecGetOwnershipRange(x,&low,NULL));
980 
981   if (size==1) {
982     CHKERRQ(ISCreateStride(comm,fft->N,0,1,&list1));
983     CHKERRQ(VecScatterCreate(x,list1,y,list1,&vecscat));
984     CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
985     CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
986     CHKERRQ(VecScatterDestroy(&vecscat));
987     CHKERRQ(ISDestroy(&list1));
988 
989 #if !PetscDefined(HAVE_MPIUNI)
990   } else {
991     Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
992     PetscInt       ndim  = fft->ndim,*dim = fft->dim;
993     ptrdiff_t      local_n0,local_0_start;
994     ptrdiff_t      local_n1,local_1_start;
995     IS             list2;
996 #if !defined(PETSC_USE_COMPLEX)
997     PetscInt       i,j,k,partial_dim;
998     PetscInt       *indx1, *indx2, tempindx, tempindx1;
999     PetscInt       NM;
1000     ptrdiff_t      temp;
1001 #endif
1002     switch (ndim) {
1003     case 1:
1004 #if defined(PETSC_USE_COMPLEX)
1005       fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1006 
1007       CHKERRQ(ISCreateStride(comm,local_n1,local_1_start,1,&list1));
1008       CHKERRQ(ISCreateStride(comm,local_n1,low,1,&list2));
1009       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
1010       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1011       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1012       CHKERRQ(VecScatterDestroy(&vecscat));
1013       CHKERRQ(ISDestroy(&list1));
1014       CHKERRQ(ISDestroy(&list2));
1015 #else
1016       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
1017 #endif
1018       break;
1019     case 2:
1020 #if defined(PETSC_USE_COMPLEX)
1021       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1022 
1023       CHKERRQ(ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1));
1024       CHKERRQ(ISCreateStride(comm,local_n0*dim[1],low,1,&list2));
1025       CHKERRQ(VecScatterCreate(x,list2,y,list1,&vecscat));
1026       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1027       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1028       CHKERRQ(VecScatterDestroy(&vecscat));
1029       CHKERRQ(ISDestroy(&list1));
1030       CHKERRQ(ISDestroy(&list2));
1031 #else
1032       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1033 
1034       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx1));
1035       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*dim[1],&indx2));
1036 
1037       if (dim[1]%2==0) NM = dim[1]+2;
1038       else             NM = dim[1]+1;
1039 
1040       for (i=0; i<local_n0; i++) {
1041         for (j=0; j<dim[1]; j++) {
1042           tempindx = i*dim[1] + j;
1043           tempindx1 = i*NM + j;
1044 
1045           indx1[tempindx]=local_0_start*dim[1]+tempindx;
1046           indx2[tempindx]=low+tempindx1;
1047         }
1048       }
1049 
1050       CHKERRQ(ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1));
1051       CHKERRQ(ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2));
1052 
1053       CHKERRQ(VecScatterCreate(x,list2,y,list1,&vecscat));
1054       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1055       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1056       CHKERRQ(VecScatterDestroy(&vecscat));
1057       CHKERRQ(ISDestroy(&list1));
1058       CHKERRQ(ISDestroy(&list2));
1059       CHKERRQ(PetscFree(indx1));
1060       CHKERRQ(PetscFree(indx2));
1061 #endif
1062       break;
1063     case 3:
1064 #if defined(PETSC_USE_COMPLEX)
1065       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1066 
1067       CHKERRQ(ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1));
1068       CHKERRQ(ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2));
1069       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
1070       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1071       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1072       CHKERRQ(VecScatterDestroy(&vecscat));
1073       CHKERRQ(ISDestroy(&list1));
1074       CHKERRQ(ISDestroy(&list2));
1075 #else
1076       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1077 
1078       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx1));
1079       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*dim[1]*dim[2],&indx2));
1080 
1081       if (dim[2]%2==0) NM = dim[2]+2;
1082       else             NM = dim[2]+1;
1083 
1084       for (i=0; i<local_n0; i++) {
1085         for (j=0; j<dim[1]; j++) {
1086           for (k=0; k<dim[2]; k++) {
1087             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1088             tempindx1 = i*dim[1]*NM + j*NM + k;
1089 
1090             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1091             indx2[tempindx]=low+tempindx1;
1092           }
1093         }
1094       }
1095 
1096       CHKERRQ(ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1));
1097       CHKERRQ(ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2));
1098 
1099       CHKERRQ(VecScatterCreate(x,list2,y,list1,&vecscat));
1100       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1101       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1102       CHKERRQ(VecScatterDestroy(&vecscat));
1103       CHKERRQ(ISDestroy(&list1));
1104       CHKERRQ(ISDestroy(&list2));
1105       CHKERRQ(PetscFree(indx1));
1106       CHKERRQ(PetscFree(indx2));
1107 #endif
1108       break;
1109     default:
1110 #if defined(PETSC_USE_COMPLEX)
1111       fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1112 
1113       CHKERRQ(ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1));
1114       CHKERRQ(ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2));
1115       CHKERRQ(VecScatterCreate(x,list1,y,list2,&vecscat));
1116       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1117       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1118       CHKERRQ(VecScatterDestroy(&vecscat));
1119       CHKERRQ(ISDestroy(&list1));
1120       CHKERRQ(ISDestroy(&list2));
1121 #else
1122       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1123 
1124       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1125 
1126       fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1127 
1128       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1129 
1130       partial_dim = fftw->partial_dim;
1131 
1132       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx1));
1133       CHKERRQ(PetscMalloc1(((PetscInt)local_n0)*partial_dim,&indx2));
1134 
1135       if (dim[ndim-1]%2==0) NM = 2;
1136       else                  NM = 1;
1137 
1138       j = low;
1139       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1140         indx1[i] = local_0_start*partial_dim + i;
1141         indx2[i] = j;
1142         if (k%dim[ndim-1]==0) j+=NM;
1143         j++;
1144       }
1145       CHKERRQ(ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1));
1146       CHKERRQ(ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2));
1147 
1148       CHKERRQ(VecScatterCreate(x,list2,y,list1,&vecscat));
1149       CHKERRQ(VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1150       CHKERRQ(VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD));
1151       CHKERRQ(VecScatterDestroy(&vecscat));
1152       CHKERRQ(ISDestroy(&list1));
1153       CHKERRQ(ISDestroy(&list2));
1154       CHKERRQ(PetscFree(indx1));
1155       CHKERRQ(PetscFree(indx2));
1156 #endif
1157       break;
1158     }
1159 #endif
1160   }
1161   PetscFunctionReturn(0);
1162 }
1163 
1164 /*
1165     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1166 
1167   Options Database Keys:
. -mat_fftw_plannerflags - set FFTW planner flags
1169 
1170    Level: intermediate
1171 
1172 */
1173 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1174 {
1175   MPI_Comm       comm;
1176   Mat_FFT        *fft = (Mat_FFT*)A->data;
1177   Mat_FFTW       *fftw;
1178   PetscInt       ndim = fft->ndim,*dim = fft->dim;
1179   const char     *plans[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1180   unsigned       iplans[]={FFTW_ESTIMATE,FFTW_MEASURE,FFTW_PATIENT,FFTW_EXHAUSTIVE};
1181   PetscBool      flg;
1182   PetscInt       p_flag,partial_dim=1,ctr;
1183   PetscMPIInt    size,rank;
1184   ptrdiff_t      *pdim;
1185   PetscErrorCode ierr;
1186 #if !defined(PETSC_USE_COMPLEX)
1187   PetscInt       tot_dim;
1188 #endif
1189 
1190   PetscFunctionBegin;
1191   CHKERRQ(PetscObjectGetComm((PetscObject)A,&comm));
1192   CHKERRMPI(MPI_Comm_size(comm, &size));
1193   CHKERRMPI(MPI_Comm_rank(comm, &rank));
1194 
1195 #if !PetscDefined(HAVE_MPIUNI)
1196   fftw_mpi_init();
1197 #endif
1198   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1199   pdim[0] = dim[0];
1200 #if !defined(PETSC_USE_COMPLEX)
1201   tot_dim = 2*dim[0];
1202 #endif
1203   for (ctr=1; ctr<ndim; ctr++) {
1204     partial_dim *= dim[ctr];
1205     pdim[ctr]    = dim[ctr];
1206 #if !defined(PETSC_USE_COMPLEX)
1207     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1208     else             tot_dim *= dim[ctr];
1209 #endif
1210   }
1211 
1212   if (size == 1) {
1213 #if defined(PETSC_USE_COMPLEX)
1214     CHKERRQ(MatSetSizes(A,fft->N,fft->N,fft->N,fft->N));
1215     fft->n = fft->N;
1216 #else
1217     CHKERRQ(MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim));
1218     fft->n = tot_dim;
1219 #endif
1220 #if !PetscDefined(HAVE_MPIUNI)
1221   } else {
1222     ptrdiff_t local_n0,local_0_start,local_n1,local_1_start;
1223 #if !defined(PETSC_USE_COMPLEX)
1224     ptrdiff_t temp;
1225     PetscInt  N1;
1226 #endif
1227 
1228     switch (ndim) {
1229     case 1:
1230 #if !defined(PETSC_USE_COMPLEX)
1231       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1232 #else
1233       fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1234       fft->n = (PetscInt)local_n0;
1235       CHKERRQ(MatSetSizes(A,local_n1,fft->n,fft->N,fft->N));
1236 #endif
1237       break;
1238     case 2:
1239 #if defined(PETSC_USE_COMPLEX)
1240       fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1241       fft->n    = (PetscInt)local_n0*dim[1];
1242       CHKERRQ(MatSetSizes(A,fft->n,fft->n,fft->N,fft->N));
1243 #else
1244       fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1245 
1246       fft->n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1247       CHKERRQ(MatSetSizes(A,fft->n,fft->n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1)));
1248 #endif
1249       break;
1250     case 3:
1251 #if defined(PETSC_USE_COMPLEX)
1252       fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1253 
1254       fft->n = (PetscInt)local_n0*dim[1]*dim[2];
1255       CHKERRQ(MatSetSizes(A,fft->n,fft->n,fft->N,fft->N));
1256 #else
1257       fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1258 
1259       fft->n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1260       CHKERRQ(MatSetSizes(A,fft->n,fft->n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1)));
1261 #endif
1262       break;
1263     default:
1264 #if defined(PETSC_USE_COMPLEX)
1265       fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1266 
1267       fft->n = (PetscInt)local_n0*partial_dim;
1268       CHKERRQ(MatSetSizes(A,fft->n,fft->n,fft->N,fft->N));
1269 #else
1270       temp = pdim[ndim-1];
1271 
1272       pdim[ndim-1] = temp/2 + 1;
1273 
1274       fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1275 
1276       fft->n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1277       N1 = 2*fft->N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1278 
1279       pdim[ndim-1] = temp;
1280 
1281       CHKERRQ(MatSetSizes(A,fft->n,fft->n,N1,N1));
1282 #endif
1283       break;
1284     }
1285 #endif
1286   }
1287   free(pdim);
1288   CHKERRQ(PetscObjectChangeTypeName((PetscObject)A,MATFFTW));
1289   CHKERRQ(PetscNewLog(A,&fftw));
1290   fft->data = (void*)fftw;
1291 
1292   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1293   fftw->partial_dim = partial_dim;
1294 
1295   CHKERRQ(PetscMalloc1(ndim, &(fftw->dim_fftw)));
1296   if (size == 1) {
1297 #if defined(PETSC_USE_64BIT_INDICES)
1298     fftw->iodims = (fftw_iodim64 *) malloc(sizeof(fftw_iodim64) * ndim);
1299 #else
1300     fftw->iodims = (fftw_iodim *) malloc(sizeof(fftw_iodim) * ndim);
1301 #endif
1302   }
1303 
1304   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1305 
1306   fftw->p_forward  = NULL;
1307   fftw->p_backward = NULL;
1308   fftw->p_flag     = FFTW_ESTIMATE;
1309 
1310   if (size == 1) {
1311     A->ops->mult          = MatMult_SeqFFTW;
1312     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1313 #if !PetscDefined(HAVE_MPIUNI)
1314   } else {
1315     A->ops->mult          = MatMult_MPIFFTW;
1316     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1317 #endif
1318   }
1319   fft->matdestroy = MatDestroy_FFTW;
1320   A->assembled    = PETSC_TRUE;
1321   A->preallocated = PETSC_TRUE;
1322 
1323   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCreateVecsFFTW_C",MatCreateVecsFFTW_FFTW));
1324   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW));
1325   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW));
1326 
1327   /* get runtime options */
1328   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1329   CHKERRQ(PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",plans,4,plans[0],&p_flag,&flg));
1330   if (flg) fftw->p_flag = iplans[p_flag];
1331   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1332   PetscFunctionReturn(0);
1333 }
1334