xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 047240e14af00aad1ef65e96f6fface8924f7f7e)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
12 typedef struct {
13   ptrdiff_t   ndim_fftw,*dim_fftw;
14   PetscInt    partial_dim;
15   fftw_plan   p_forward,p_backward;
16   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
17   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
18                                                             executed for the arrays with which the plan was created */
19 } Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
28 /* MatMult_SeqFFTW performs forward DFT in parallel
29    Input parameter:
30      A - the matrix
31      x - the vector on which FDFT will be performed
32 
33    Output parameter:
34      y - vector that stores result of FDFT
35 */
36 #undef __FUNCT__
37 #define __FUNCT__ "MatMult_SeqFFTW"
38 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
39 {
40   PetscErrorCode ierr;
41   Mat_FFT        *fft  = (Mat_FFT*)A->data;
42   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
43   PetscScalar    *x_array,*y_array;
44   PetscInt       ndim=fft->ndim,*dim=fft->dim;
45 
46   PetscFunctionBegin;
47   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
48   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
49   if (!fftw->p_forward) { /* create a plan, then excute it */
50     switch (ndim) {
51     case 1:
52 #if defined(PETSC_USE_COMPLEX)
53       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
54 #else
55       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
56 #endif
57       break;
58     case 2:
59 #if defined(PETSC_USE_COMPLEX)
60       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
61 #else
62       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
63 #endif
64       break;
65     case 3:
66 #if defined(PETSC_USE_COMPLEX)
67       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
68 #else
69       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
70 #endif
71       break;
72     default:
73 #if defined(PETSC_USE_COMPLEX)
74       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
75 #else
76       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
77 #endif
78       break;
79     }
80     fftw->finarray  = x_array;
81     fftw->foutarray = y_array;
82     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
83                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
84     fftw_execute(fftw->p_forward);
85   } else { /* use existing plan */
86     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
87       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
88     } else {
89       fftw_execute(fftw->p_forward);
90     }
91   }
92   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
93   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
94   PetscFunctionReturn(0);
95 }
96 
97 /* MatMultTranspose_SeqFFTW performs serial backward DFT
98    Input parameter:
99      A - the matrix
100      x - the vector on which BDFT will be performed
101 
102    Output parameter:
103      y - vector that stores result of BDFT
104 */
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
108 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
109 {
110   PetscErrorCode ierr;
111   Mat_FFT        *fft  = (Mat_FFT*)A->data;
112   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
113   PetscScalar    *x_array,*y_array;
114   PetscInt       ndim=fft->ndim,*dim=fft->dim;
115 
116   PetscFunctionBegin;
117   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
118   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
119   if (!fftw->p_backward) { /* create a plan, then excute it */
120     switch (ndim) {
121     case 1:
122 #if defined(PETSC_USE_COMPLEX)
123       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
124 #else
125       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
126 #endif
127       break;
128     case 2:
129 #if defined(PETSC_USE_COMPLEX)
130       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
131 #else
132       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
133 #endif
134       break;
135     case 3:
136 #if defined(PETSC_USE_COMPLEX)
137       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
138 #else
139       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
140 #endif
141       break;
142     default:
143 #if defined(PETSC_USE_COMPLEX)
144       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
145 #else
146       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
147 #endif
148       break;
149     }
150     fftw->binarray  = x_array;
151     fftw->boutarray = y_array;
152     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
153   } else { /* use existing plan */
154     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
155       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
156     } else {
157       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
158     }
159   }
160   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
161   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
162   PetscFunctionReturn(0);
163 }
164 
165 /* MatMult_MPIFFTW performs forward DFT in parallel
166    Input parameter:
167      A - the matrix
168      x - the vector on which FDFT will be performed
169 
170    Output parameter:
171    y   - vector that stores result of FDFT
172 */
173 #undef __FUNCT__
174 #define __FUNCT__ "MatMult_MPIFFTW"
175 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
176 {
177   PetscErrorCode ierr;
178   Mat_FFT        *fft  = (Mat_FFT*)A->data;
179   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
180   PetscScalar    *x_array,*y_array;
181   PetscInt       ndim=fft->ndim,*dim=fft->dim;
182   MPI_Comm       comm=((PetscObject)A)->comm;
183 
184   PetscFunctionBegin;
185   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
186   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
187   if (!fftw->p_forward) { /* create a plan, then excute it */
188     switch (ndim) {
189     case 1:
190 #if defined(PETSC_USE_COMPLEX)
191       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
192 #else
193       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
194 #endif
195       break;
196     case 2:
197 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
198       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
199 #else
200       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
201 #endif
202       break;
203     case 3:
204 #if defined(PETSC_USE_COMPLEX)
205       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
206 #else
207       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
208 #endif
209       break;
210     default:
211 #if defined(PETSC_USE_COMPLEX)
212       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
213 #else
214       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
215 #endif
216       break;
217     }
218     fftw->finarray  = x_array;
219     fftw->foutarray = y_array;
220     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
221                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
222     fftw_execute(fftw->p_forward);
223   } else { /* use existing plan */
224     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
225       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
226     } else {
227       fftw_execute(fftw->p_forward);
228     }
229   }
230   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
231   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
232   PetscFunctionReturn(0);
233 }
234 
235 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
236    Input parameter:
237      A - the matrix
238      x - the vector on which BDFT will be performed
239 
240    Output parameter:
241      y - vector that stores result of BDFT
242 */
243 #undef __FUNCT__
244 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
245 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
246 {
247   PetscErrorCode ierr;
248   Mat_FFT        *fft  = (Mat_FFT*)A->data;
249   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
250   PetscScalar    *x_array,*y_array;
251   PetscInt       ndim=fft->ndim,*dim=fft->dim;
252   MPI_Comm       comm=((PetscObject)A)->comm;
253 
254   PetscFunctionBegin;
255   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
256   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
257   if (!fftw->p_backward) { /* create a plan, then excute it */
258     switch (ndim) {
259     case 1:
260 #if defined(PETSC_USE_COMPLEX)
261       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
262 #else
263       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
264 #endif
265       break;
266     case 2:
267 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
268       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
269 #else
270       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
271 #endif
272       break;
273     case 3:
274 #if defined(PETSC_USE_COMPLEX)
275       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
276 #else
277       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
278 #endif
279       break;
280     default:
281 #if defined(PETSC_USE_COMPLEX)
282       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
283 #else
284       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
285 #endif
286       break;
287     }
288     fftw->binarray  = x_array;
289     fftw->boutarray = y_array;
290     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
291   } else { /* use existing plan */
292     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
293       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
294     } else {
295       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
296     }
297   }
298   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
299   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
300   PetscFunctionReturn(0);
301 }
302 
303 #undef __FUNCT__
304 #define __FUNCT__ "MatDestroy_FFTW"
305 PetscErrorCode MatDestroy_FFTW(Mat A)
306 {
307   Mat_FFT        *fft  = (Mat_FFT*)A->data;
308   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
309   PetscErrorCode ierr;
310 
311   PetscFunctionBegin;
312   fftw_destroy_plan(fftw->p_forward);
313   fftw_destroy_plan(fftw->p_backward);
314   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
315   ierr = PetscFree(fft->data);CHKERRQ(ierr);
316   fftw_mpi_cleanup();
317   PetscFunctionReturn(0);
318 }
319 
320 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
321 #undef __FUNCT__
322 #define __FUNCT__ "VecDestroy_MPIFFTW"
323 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
324 {
325   PetscErrorCode ierr;
326   PetscScalar    *array;
327 
328   PetscFunctionBegin;
329   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
330   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
331   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
332   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
333   PetscFunctionReturn(0);
334 }
335 
336 #undef __FUNCT__
337 #define __FUNCT__ "MatGetVecsFFTW"
338 /*@
339    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
340      parallel layout determined by FFTW
341 
342    Collective on Mat
343 
344    Input Parameter:
345 .   A - the matrix
346 
347    Output Parameter:
348 +   x - (optional) input vector of forward FFTW
349 -   y - (optional) output vector of forward FFTW
350 -   z - (optional) output vector of backward FFTW
351 
352   Level: advanced
353 
354   Note: The parallel layout of output of forward FFTW is always same as the input
355         of the backward FFTW. But parallel layout of the input vector of forward
356         FFTW might not be same as the output of backward FFTW.
357         Also note that we need to provide enough space while doing parallel real transform.
358         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
359         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
360         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
361         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
362         zeros if it is odd only one zero is needed.
363         Lastly one needs some scratch space at the end of data set in each process. alloc_local
364         figures out how much space is needed, i.e. it figures out the data+scratch space for
365         each processor and returns that.
366 
367 .seealso: MatCreateFFTW()
368 @*/
369 PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
370 {
371   PetscErrorCode ierr;
372 
373   PetscFunctionBegin;
374   ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
375   PetscFunctionReturn(0);
376 }
377 
378 EXTERN_C_BEGIN
379 #undef __FUNCT__
380 #define __FUNCT__ "MatGetVecsFFTW_FFTW"
381 PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
382 {
383   PetscErrorCode ierr;
384   PetscMPIInt    size,rank;
385   MPI_Comm       comm  = ((PetscObject)A)->comm;
386   Mat_FFT        *fft  = (Mat_FFT*)A->data;
387   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
388   PetscInt       N     = fft->N;
389   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;
390 
391   PetscFunctionBegin;
392   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
393   PetscValidType(A,1);
394 
395   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
396   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
397   if (size == 1) { /* sequential case */
398 #if defined(PETSC_USE_COMPLEX)
399     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
400     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
401     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
402 #else
403     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
404     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
405     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
406 #endif
407   } else { /* parallel cases */
408     ptrdiff_t    alloc_local,local_n0,local_0_start;
409     ptrdiff_t    local_n1;
410     fftw_complex *data_fout;
411     ptrdiff_t    local_1_start;
412 #if defined(PETSC_USE_COMPLEX)
413     fftw_complex *data_fin,*data_bout;
414 #else
415     double    *data_finr,*data_boutr;
416     PetscInt  n1,N1;
417     ptrdiff_t temp;
418 #endif
419 
420     switch (ndim) {
421     case 1:
422 #if !defined(PETSC_USE_COMPLEX)
423       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
424 #else
425       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
426       if (fin) {
427         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
428         ierr      = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
429 
430         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
431       }
432       if (fout) {
433         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
434         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
435 
436         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
437       }
438       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
439       if (bout) {
440         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
441         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
442 
443         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
444       }
445       break;
446 #endif
447     case 2:
448 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
449       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
450       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
451       if (fin) {
452         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
453         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
454 
455         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
456       }
457       if (bout) {
458         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
459         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
460 
461         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
462       }
463       if (fout) {
464         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
465         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
466 
467         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
468       }
469 #else
470       /* Get local size */
471       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
472       if (fin) {
473         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
474         ierr     = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
475 
476         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
477       }
478       if (fout) {
479         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
480         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
481 
482         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
483       }
484       if (bout) {
485         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
486         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
487 
488         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
489       }
490 #endif
491       break;
492     case 3:
493 #if !defined(PETSC_USE_COMPLEX)
494       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
495       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
496       if (fin) {
497         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
498         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
499 
500         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
501       }
502       if (bout) {
503         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
504         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
505 
506         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
507       }
508 
509       if (fout) {
510         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
511         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
512 
513         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
514       }
515 #else
516       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
517       if (fin) {
518         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
519         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
520 
521         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
522       }
523       if (fout) {
524         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
525         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
526 
527         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
528       }
529       if (bout) {
530         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
531         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
532 
533         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
534       }
535 #endif
536       break;
537     default:
538 #if !defined(PETSC_USE_COMPLEX)
539       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
540 
541       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
542 
543       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
544       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
545 
546       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
547 
548       if (fin) {
549         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
550         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
551 
552         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
553       }
554       if (bout) {
555         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
556         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
557 
558         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
559       }
560 
561       if (fout) {
562         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
563         ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
564 
565         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
566       }
567 #else
568       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
569       if (fin) {
570         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
571         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
572 
573         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
574       }
575       if (fout) {
576         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
577         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
578 
579         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
580       }
581       if (bout) {
582         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
583         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
584 
585         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
586       }
587 #endif
588       break;
589     }
590   }
591   PetscFunctionReturn(0);
592 }
593 EXTERN_C_END
594 
595 #undef __FUNCT__
596 #define __FUNCT__ "VecScatterPetscToFFTW"
597 /*@
598    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
599 
600    Collective on Mat
601 
602    Input Parameters:
603 +  A - FFTW matrix
604 -  x - the PETSc vector
605 
606    Output Parameters:
607 .  y - the FFTW vector
608 
609   Options Database Keys:
610 . -mat_fftw_plannerflags - set FFTW planner flags
611 
612    Level: intermediate
613 
614    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
615          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
616          zeros. This routine does that job by scattering operation.
617 
618 .seealso: VecScatterFFTWToPetsc()
619 @*/
620 PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
621 {
622   PetscErrorCode ierr;
623 
624   PetscFunctionBegin;
625   ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
626   PetscFunctionReturn(0);
627 }
628 
629 EXTERN_C_BEGIN
630 #undef __FUNCT__
631 #define __FUNCT__ "VecScatterPetscToFFTW_FFTW"
632 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
633 {
634   PetscErrorCode ierr;
635   MPI_Comm       comm  =((PetscObject)A)->comm;
636   Mat_FFT        *fft  = (Mat_FFT*)A->data;
637   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
638   PetscInt       N     =fft->N;
639   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
640   PetscInt       low;
641   PetscInt       rank,size,vsize,vsize1;
642   ptrdiff_t      alloc_local,local_n0,local_0_start;
643   ptrdiff_t      local_n1,local_1_start;
644   VecScatter     vecscat;
645   IS             list1,list2;
646 #if !defined(PETSC_USE_COMPLEX)
647   PetscInt  i,j,k,partial_dim;
648   PetscInt  *indx1, *indx2, tempindx, tempindx1;
649   PetscInt  N1, n1,NM;
650   ptrdiff_t temp;
651 #endif
652 
653   PetscFunctionBegin;
654   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
655   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
656   ierr = VecGetOwnershipRange(y,&low,PETSC_NULL);
657 
658   if (size==1) {
659     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
660     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
661     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
662     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
663     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
664     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
665     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
666     ierr = ISDestroy(&list1);CHKERRQ(ierr);
667   } else {
668     switch (ndim) {
669     case 1:
670 #if defined(PETSC_USE_COMPLEX)
671       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
672 
673       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
674       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
675       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
676       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
677       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
678       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
679       ierr = ISDestroy(&list1);CHKERRQ(ierr);
680       ierr = ISDestroy(&list2);CHKERRQ(ierr);
681 #else
682       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
683 #endif
684       break;
685     case 2:
686 #if defined(PETSC_USE_COMPLEX)
687       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
688 
689       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
690       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
691       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
692       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
693       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
694       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
695       ierr = ISDestroy(&list1);CHKERRQ(ierr);
696       ierr = ISDestroy(&list2);CHKERRQ(ierr);
697 #else
698       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
699 
700       N1   = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
701       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
702       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
703 
704       if (dim[1]%2==0) {
705         NM = dim[1]+2;
706       } else {
707         NM = dim[1]+1;
708       }
709       for (i=0; i<local_n0; i++) {
710         for (j=0; j<dim[1]; j++) {
711           tempindx  = i*dim[1] + j;
712           tempindx1 = i*NM + j;
713 
714           indx1[tempindx]=local_0_start*dim[1]+tempindx;
715           indx2[tempindx]=low+tempindx1;
716         }
717       }
718 
719       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
720       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
721 
722       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
723       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
724       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
725       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
726       ierr = ISDestroy(&list1);CHKERRQ(ierr);
727       ierr = ISDestroy(&list2);CHKERRQ(ierr);
728       ierr = PetscFree(indx1);CHKERRQ(ierr);
729       ierr = PetscFree(indx2);CHKERRQ(ierr);
730 #endif
731       break;
732 
733     case 3:
734 #if defined(PETSC_USE_COMPLEX)
735       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
736 
737       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
738       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
739       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
740       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
741       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
742       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
743       ierr = ISDestroy(&list1);CHKERRQ(ierr);
744       ierr = ISDestroy(&list2);CHKERRQ(ierr);
745 #else
746       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
747 
748       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
749 
750       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
751       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
752 
753       if (dim[2]%2==0) NM = dim[2]+2;
754       else             NM = dim[2]+1;
755 
756       for (i=0; i<local_n0; i++) {
757         for (j=0; j<dim[1]; j++) {
758           for (k=0; k<dim[2]; k++) {
759             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
760             tempindx1 = i*dim[1]*NM + j*NM + k;
761 
762             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
763             indx2[tempindx]=low+tempindx1;
764           }
765         }
766       }
767 
768       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
769       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
770       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
771       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
772       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
773       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
774       ierr = ISDestroy(&list1);CHKERRQ(ierr);
775       ierr = ISDestroy(&list2);CHKERRQ(ierr);
776       ierr = PetscFree(indx1);CHKERRQ(ierr);
777       ierr = PetscFree(indx2);CHKERRQ(ierr);
778 #endif
779       break;
780 
781     default:
782 #if defined(PETSC_USE_COMPLEX)
783       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
784 
785       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
786       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
787       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
788       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
789       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
790       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
791       ierr = ISDestroy(&list1);CHKERRQ(ierr);
792       ierr = ISDestroy(&list2);CHKERRQ(ierr);
793 #else
794       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
795 
796       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
797 
798       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
799 
800       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
801 
802       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
803 
804       partial_dim = fftw->partial_dim;
805 
806       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
807       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
808 
809       if (dim[ndim-1]%2==0) NM = 2;
810       else                  NM = 1;
811 
812       j = low;
813       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
814         indx1[i] = local_0_start*partial_dim + i;
815         indx2[i] = j;
816         if (k%dim[ndim-1]==0) j+=NM;
817         j++;
818       }
819       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
820       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
821       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
822       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
823       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
824       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
825       ierr = ISDestroy(&list1);CHKERRQ(ierr);
826       ierr = ISDestroy(&list2);CHKERRQ(ierr);
827       ierr = PetscFree(indx1);CHKERRQ(ierr);
828       ierr = PetscFree(indx2);CHKERRQ(ierr);
829 #endif
830       break;
831     }
832   }
833   PetscFunctionReturn(0);
834 }
835 EXTERN_C_END
836 
837 #undef __FUNCT__
838 #define __FUNCT__ "VecScatterFFTWToPetsc"
839 /*@
840    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
841 
842    Collective on Mat
843 
844     Input Parameters:
845 +   A - FFTW matrix
846 -   x - FFTW vector
847 
848    Output Parameters:
849 .  y - PETSc vector
850 
851    Level: intermediate
852 
853    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
854          VecScatterFFTWToPetsc removes those extra zeros.
855 
856 .seealso: VecScatterPetscToFFTW()
857 @*/
858 PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
859 {
860   PetscErrorCode ierr;
861 
862   PetscFunctionBegin;
863   ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 EXTERN_C_BEGIN
868 #undef __FUNCT__
869 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
870 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
871 {
872   PetscErrorCode ierr;
873   MPI_Comm       comm  = ((PetscObject)A)->comm;
874   Mat_FFT        *fft  = (Mat_FFT*)A->data;
875   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
876   PetscInt       N     = fft->N;
877   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
878   PetscInt       low;
879   PetscInt       rank,size;
880   ptrdiff_t      alloc_local,local_n0,local_0_start;
881   ptrdiff_t      local_n1,local_1_start;
882   VecScatter     vecscat;
883   IS             list1,list2;
884 #if !defined(PETSC_USE_COMPLEX)
885   PetscInt  i,j,k,partial_dim;
886   PetscInt  *indx1, *indx2, tempindx, tempindx1;
887   PetscInt  N1, n1,NM;
888   ptrdiff_t temp;
889 #endif
890 
891   PetscFunctionBegin;
892   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
893   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
894   ierr = VecGetOwnershipRange(x,&low,PETSC_NULL);CHKERRQ(ierr);
895 
896   if (size==1) {
897     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
898     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
899     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
900     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
901     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
902     ierr = ISDestroy(&list1);CHKERRQ(ierr);
903 
904   } else {
905     switch (ndim) {
906     case 1:
907 #if defined(PETSC_USE_COMPLEX)
908       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
909 
910       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
911       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
912       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
913       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
914       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
915       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
916       ierr = ISDestroy(&list1);CHKERRQ(ierr);
917       ierr = ISDestroy(&list2);CHKERRQ(ierr);
918 #else
919       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
920 #endif
921       break;
922     case 2:
923 #if defined(PETSC_USE_COMPLEX)
924       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
925 
926       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
927       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
928       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
929       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
930       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
931       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
932       ierr = ISDestroy(&list1);CHKERRQ(ierr);
933       ierr = ISDestroy(&list2);CHKERRQ(ierr);
934 #else
935       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
936 
937       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
938 
939       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
940       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
941 
942       if (dim[1]%2==0) NM = dim[1]+2;
943       else             NM = dim[1]+1;
944 
945       for (i=0; i<local_n0; i++) {
946         for (j=0; j<dim[1]; j++) {
947           tempindx = i*dim[1] + j;
948           tempindx1 = i*NM + j;
949 
950           indx1[tempindx]=local_0_start*dim[1]+tempindx;
951           indx2[tempindx]=low+tempindx1;
952         }
953       }
954 
955       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
956       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
957 
958       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
959       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
960       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
961       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
962       ierr = ISDestroy(&list1);CHKERRQ(ierr);
963       ierr = ISDestroy(&list2);CHKERRQ(ierr);
964       ierr = PetscFree(indx1);CHKERRQ(ierr);
965       ierr = PetscFree(indx2);CHKERRQ(ierr);
966 #endif
967       break;
968     case 3:
969 #if defined(PETSC_USE_COMPLEX)
970       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
971 
972       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
973       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
974       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
975       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
976       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
977       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
978       ierr = ISDestroy(&list1);CHKERRQ(ierr);
979       ierr = ISDestroy(&list2);CHKERRQ(ierr);
980 #else
981 
982       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
983 
984       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
985 
986       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
987       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
988 
989       if (dim[2]%2==0) NM = dim[2]+2;
990       else             NM = dim[2]+1;
991 
992       for (i=0; i<local_n0; i++) {
993         for (j=0; j<dim[1]; j++) {
994           for (k=0; k<dim[2]; k++) {
995             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
996             tempindx1 = i*dim[1]*NM + j*NM + k;
997 
998             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
999             indx2[tempindx]=low+tempindx1;
1000           }
1001         }
1002       }
1003 
1004       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1005       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1006 
1007       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1008       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1009       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1010       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1011       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1012       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1013       ierr = PetscFree(indx1);CHKERRQ(ierr);
1014       ierr = PetscFree(indx2);CHKERRQ(ierr);
1015 #endif
1016       break;
1017     default:
1018 #if defined(PETSC_USE_COMPLEX)
1019       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1020 
1021       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1022       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1023       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1024       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1025       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1026       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1027       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1028       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1029 #else
1030       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1031 
1032       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1033 
1034       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1035 
1036       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
1037 
1038       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1039 
1040       partial_dim = fftw->partial_dim;
1041 
1042       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1043       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1044 
1045       if (dim[ndim-1]%2==0) NM = 2;
1046       else                  NM = 1;
1047 
1048       j = low;
1049       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1050         indx1[i] = local_0_start*partial_dim + i;
1051         indx2[i] = j;
1052         if (k%dim[ndim-1]==0) j+=NM;
1053         j++;
1054       }
1055       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1056       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1057 
1058       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1059       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1060       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1061       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1062       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1063       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1064       ierr = PetscFree(indx1);CHKERRQ(ierr);
1065       ierr = PetscFree(indx2);CHKERRQ(ierr);
1066 #endif
1067       break;
1068     }
1069   }
1070   PetscFunctionReturn(0);
1071 }
1072 EXTERN_C_END
1073 
1074 EXTERN_C_BEGIN
1075 #undef __FUNCT__
1076 #define __FUNCT__ "MatCreate_FFTW"
1077 /*
1078     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1079 
1080   Options Database Keys:
1081 + -mat_fftw_plannerflags - set FFTW planner flags
1082 
1083    Level: intermediate
1084 
1085 */
1086 PetscErrorCode MatCreate_FFTW(Mat A)
1087 {
1088   PetscErrorCode ierr;
1089   MPI_Comm       comm=((PetscObject)A)->comm;
1090   Mat_FFT        *fft=(Mat_FFT*)A->data;
1091   Mat_FFTW       *fftw;
1092   PetscInt       n         =fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1093   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1094   PetscBool      flg;
1095   PetscInt       p_flag,partial_dim=1,ctr;
1096   PetscMPIInt    size,rank;
1097   ptrdiff_t      *pdim;
1098   ptrdiff_t      local_n1,local_1_start;
1099 #if !defined(PETSC_USE_COMPLEX)
1100   ptrdiff_t temp;
1101   PetscInt  N1,tot_dim;
1102 #else
1103   PetscInt n1;
1104 #endif
1105 
1106   PetscFunctionBegin;
1107   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1108   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1109 
1110   fftw_mpi_init();
1111   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1112   pdim[0] = dim[0];
1113 #if !defined(PETSC_USE_COMPLEX)
1114   tot_dim = 2*dim[0];
1115 #endif
1116   for (ctr=1; ctr<ndim; ctr++) {
1117     partial_dim *= dim[ctr];
1118     pdim[ctr]    = dim[ctr];
1119 #if !defined(PETSC_USE_COMPLEX)
1120     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1121     else             tot_dim *= dim[ctr];
1122 #endif
1123   }
1124 
1125   if (size == 1) {
1126 #if defined(PETSC_USE_COMPLEX)
1127     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1128     n    = N;
1129 #else
1130     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1131     n    = tot_dim;
1132 #endif
1133 
1134   } else {
1135     ptrdiff_t alloc_local,local_n0,local_0_start;
1136     switch (ndim) {
1137     case 1:
1138 #if !defined(PETSC_USE_COMPLEX)
1139       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1140 #else
1141       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1142 
1143       n    = (PetscInt)local_n0;
1144       n1   = (PetscInt)local_n1;
1145       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1146 #endif
1147       break;
1148     case 2:
1149 #if defined(PETSC_USE_COMPLEX)
1150       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1151       /*
1152        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1153        PetscSynchronizedFlush(comm);
1154        */
1155       n    = (PetscInt)local_n0*dim[1];
1156       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1157 #else
1158       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1159 
1160       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1161       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1162 #endif
1163       break;
1164     case 3:
1165 #if defined(PETSC_USE_COMPLEX)
1166       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1167 
1168       n    = (PetscInt)local_n0*dim[1]*dim[2];
1169       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1170 #else
1171       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1172 
1173       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1174       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1175 #endif
1176       break;
1177     default:
1178 #if defined(PETSC_USE_COMPLEX)
1179       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1180 
1181       n    = (PetscInt)local_n0*partial_dim;
1182       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1183 #else
1184       temp = pdim[ndim-1];
1185 
1186       pdim[ndim-1] = temp/2 + 1;
1187 
1188       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1189 
1190       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1191       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1192 
1193       pdim[ndim-1] = temp;
1194 
1195       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1196 #endif
1197       break;
1198     }
1199   }
1200   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1201   ierr      = PetscNewLog(A,Mat_FFTW,&fftw);CHKERRQ(ierr);
1202   fft->data = (void*)fftw;
1203 
1204   fft->n            = n;
1205   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1206   fftw->partial_dim = partial_dim;
1207 
1208   ierr = PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t*)&(fftw->dim_fftw));CHKERRQ(ierr);
1209 
1210   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1211 
1212   fftw->p_forward  = 0;
1213   fftw->p_backward = 0;
1214   fftw->p_flag     = FFTW_ESTIMATE;
1215 
1216   if (size == 1) {
1217     A->ops->mult          = MatMult_SeqFFTW;
1218     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1219   } else {
1220     A->ops->mult          = MatMult_MPIFFTW;
1221     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1222   }
1223   fft->matdestroy = MatDestroy_FFTW;
1224   A->assembled    = PETSC_TRUE;
1225   A->preallocated = PETSC_TRUE;
1226 
1227   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatGetVecsFFTW_C","MatGetVecsFFTW_FFTW",MatGetVecsFFTW_FFTW);CHKERRQ(ierr);
1228   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterPetscToFFTW_C","VecScatterPetscToFFTW_FFTW",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1229   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterFFTWToPetsc_C","VecScatterFFTWToPetsc_FFTW",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1230 
1231   /* get runtime options */
1232   ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1233   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);CHKERRQ(ierr);
1234   if (flg) fftw->p_flag = (unsigned)p_flag;
1235   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1236   PetscFunctionReturn(0);
1237 }
1238 EXTERN_C_END
1239 
1240 
1241 
1242 
1243