xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision dcca6d9d80ebd869fe6029bd05a3aa9faafef49e)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
12 typedef struct {
13   ptrdiff_t   ndim_fftw,*dim_fftw;
14   PetscInt    partial_dim;
15   fftw_plan   p_forward,p_backward;
16   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
17   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
18                                                             executed for the arrays with which the plan was created */
19 } Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
28 /* MatMult_SeqFFTW performs forward DFT in parallel
29    Input parameter:
30      A - the matrix
31      x - the vector on which FDFT will be performed
32 
33    Output parameter:
34      y - vector that stores result of FDFT
35 */
36 #undef __FUNCT__
37 #define __FUNCT__ "MatMult_SeqFFTW"
38 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
39 {
40   PetscErrorCode ierr;
41   Mat_FFT        *fft  = (Mat_FFT*)A->data;
42   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
43   PetscScalar    *x_array,*y_array;
44   PetscInt       ndim=fft->ndim,*dim=fft->dim;
45 
46   PetscFunctionBegin;
47   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
48   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
49   if (!fftw->p_forward) { /* create a plan, then excute it */
50     switch (ndim) {
51     case 1:
52 #if defined(PETSC_USE_COMPLEX)
53       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
54 #else
55       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
56 #endif
57       break;
58     case 2:
59 #if defined(PETSC_USE_COMPLEX)
60       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
61 #else
62       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
63 #endif
64       break;
65     case 3:
66 #if defined(PETSC_USE_COMPLEX)
67       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
68 #else
69       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
70 #endif
71       break;
72     default:
73 #if defined(PETSC_USE_COMPLEX)
74       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
75 #else
76       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
77 #endif
78       break;
79     }
80     fftw->finarray  = x_array;
81     fftw->foutarray = y_array;
82     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
83                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
84     fftw_execute(fftw->p_forward);
85   } else { /* use existing plan */
86     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
87       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
88     } else {
89       fftw_execute(fftw->p_forward);
90     }
91   }
92   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
93   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
94   PetscFunctionReturn(0);
95 }
96 
97 /* MatMultTranspose_SeqFFTW performs serial backward DFT
98    Input parameter:
99      A - the matrix
100      x - the vector on which BDFT will be performed
101 
102    Output parameter:
103      y - vector that stores result of BDFT
104 */
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
108 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
109 {
110   PetscErrorCode ierr;
111   Mat_FFT        *fft  = (Mat_FFT*)A->data;
112   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
113   PetscScalar    *x_array,*y_array;
114   PetscInt       ndim=fft->ndim,*dim=fft->dim;
115 
116   PetscFunctionBegin;
117   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
118   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
119   if (!fftw->p_backward) { /* create a plan, then excute it */
120     switch (ndim) {
121     case 1:
122 #if defined(PETSC_USE_COMPLEX)
123       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
124 #else
125       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
126 #endif
127       break;
128     case 2:
129 #if defined(PETSC_USE_COMPLEX)
130       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
131 #else
132       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
133 #endif
134       break;
135     case 3:
136 #if defined(PETSC_USE_COMPLEX)
137       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
138 #else
139       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
140 #endif
141       break;
142     default:
143 #if defined(PETSC_USE_COMPLEX)
144       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
145 #else
146       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
147 #endif
148       break;
149     }
150     fftw->binarray  = x_array;
151     fftw->boutarray = y_array;
152     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
153   } else { /* use existing plan */
154     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
155       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
156     } else {
157       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
158     }
159   }
160   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
161   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
162   PetscFunctionReturn(0);
163 }
164 
165 /* MatMult_MPIFFTW performs forward DFT in parallel
166    Input parameter:
167      A - the matrix
168      x - the vector on which FDFT will be performed
169 
170    Output parameter:
171    y   - vector that stores result of FDFT
172 */
173 #undef __FUNCT__
174 #define __FUNCT__ "MatMult_MPIFFTW"
175 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
176 {
177   PetscErrorCode ierr;
178   Mat_FFT        *fft  = (Mat_FFT*)A->data;
179   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
180   PetscScalar    *x_array,*y_array;
181   PetscInt       ndim=fft->ndim,*dim=fft->dim;
182   MPI_Comm       comm;
183 
184   PetscFunctionBegin;
185   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
186   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
187   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
188   if (!fftw->p_forward) { /* create a plan, then excute it */
189     switch (ndim) {
190     case 1:
191 #if defined(PETSC_USE_COMPLEX)
192       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
193 #else
194       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
195 #endif
196       break;
197     case 2:
198 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
199       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
200 #else
201       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
202 #endif
203       break;
204     case 3:
205 #if defined(PETSC_USE_COMPLEX)
206       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
207 #else
208       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
209 #endif
210       break;
211     default:
212 #if defined(PETSC_USE_COMPLEX)
213       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
214 #else
215       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
216 #endif
217       break;
218     }
219     fftw->finarray  = x_array;
220     fftw->foutarray = y_array;
221     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
222                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
223     fftw_execute(fftw->p_forward);
224   } else { /* use existing plan */
225     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
226       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
227     } else {
228       fftw_execute(fftw->p_forward);
229     }
230   }
231   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
232   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
233   PetscFunctionReturn(0);
234 }
235 
236 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
237    Input parameter:
238      A - the matrix
239      x - the vector on which BDFT will be performed
240 
241    Output parameter:
242      y - vector that stores result of BDFT
243 */
244 #undef __FUNCT__
245 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
246 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
247 {
248   PetscErrorCode ierr;
249   Mat_FFT        *fft  = (Mat_FFT*)A->data;
250   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
251   PetscScalar    *x_array,*y_array;
252   PetscInt       ndim=fft->ndim,*dim=fft->dim;
253   MPI_Comm       comm;
254 
255   PetscFunctionBegin;
256   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
257   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
258   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
259   if (!fftw->p_backward) { /* create a plan, then excute it */
260     switch (ndim) {
261     case 1:
262 #if defined(PETSC_USE_COMPLEX)
263       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
264 #else
265       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
266 #endif
267       break;
268     case 2:
269 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
270       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
271 #else
272       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
273 #endif
274       break;
275     case 3:
276 #if defined(PETSC_USE_COMPLEX)
277       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
278 #else
279       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
280 #endif
281       break;
282     default:
283 #if defined(PETSC_USE_COMPLEX)
284       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
285 #else
286       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
287 #endif
288       break;
289     }
290     fftw->binarray  = x_array;
291     fftw->boutarray = y_array;
292     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
293   } else { /* use existing plan */
294     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
295       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
296     } else {
297       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
298     }
299   }
300   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
301   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
302   PetscFunctionReturn(0);
303 }
304 
305 #undef __FUNCT__
306 #define __FUNCT__ "MatDestroy_FFTW"
307 PetscErrorCode MatDestroy_FFTW(Mat A)
308 {
309   Mat_FFT        *fft  = (Mat_FFT*)A->data;
310   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
311   PetscErrorCode ierr;
312 
313   PetscFunctionBegin;
314   fftw_destroy_plan(fftw->p_forward);
315   fftw_destroy_plan(fftw->p_backward);
316   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
317   ierr = PetscFree(fft->data);CHKERRQ(ierr);
318   fftw_mpi_cleanup();
319   PetscFunctionReturn(0);
320 }
321 
322 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
323 #undef __FUNCT__
324 #define __FUNCT__ "VecDestroy_MPIFFTW"
325 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
326 {
327   PetscErrorCode ierr;
328   PetscScalar    *array;
329 
330   PetscFunctionBegin;
331   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
332   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
333   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
334   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
335   PetscFunctionReturn(0);
336 }
337 
338 #undef __FUNCT__
339 #define __FUNCT__ "MatGetVecsFFTW"
340 /*@
341    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
342      parallel layout determined by FFTW
343 
344    Collective on Mat
345 
346    Input Parameter:
347 .   A - the matrix
348 
349    Output Parameter:
350 +   x - (optional) input vector of forward FFTW
351 -   y - (optional) output vector of forward FFTW
352 -   z - (optional) output vector of backward FFTW
353 
354   Level: advanced
355 
356   Note: The parallel layout of output of forward FFTW is always same as the input
357         of the backward FFTW. But parallel layout of the input vector of forward
358         FFTW might not be same as the output of backward FFTW.
359         Also note that we need to provide enough space while doing parallel real transform.
360         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
361         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
362         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
363         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
364         zeros if it is odd only one zero is needed.
365         Lastly one needs some scratch space at the end of data set in each process. alloc_local
366         figures out how much space is needed, i.e. it figures out the data+scratch space for
367         each processor and returns that.
368 
369 .seealso: MatCreateFFTW()
370 @*/
371 PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 #undef __FUNCT__
381 #define __FUNCT__ "MatGetVecsFFTW_FFTW"
382 PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
383 {
384   PetscErrorCode ierr;
385   PetscMPIInt    size,rank;
386   MPI_Comm       comm;
387   Mat_FFT        *fft  = (Mat_FFT*)A->data;
388   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
389   PetscInt       N     = fft->N;
390   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;
391 
392   PetscFunctionBegin;
393   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
394   PetscValidType(A,1);
395   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
396 
397   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
398   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
399   if (size == 1) { /* sequential case */
400 #if defined(PETSC_USE_COMPLEX)
401     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
402     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
403     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
404 #else
405     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
406     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
407     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
408 #endif
409   } else { /* parallel cases */
410     ptrdiff_t    alloc_local,local_n0,local_0_start;
411     ptrdiff_t    local_n1;
412     fftw_complex *data_fout;
413     ptrdiff_t    local_1_start;
414 #if defined(PETSC_USE_COMPLEX)
415     fftw_complex *data_fin,*data_bout;
416 #else
417     double    *data_finr,*data_boutr;
418     PetscInt  n1,N1;
419     ptrdiff_t temp;
420 #endif
421 
422     switch (ndim) {
423     case 1:
424 #if !defined(PETSC_USE_COMPLEX)
425       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
426 #else
427       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
428       if (fin) {
429         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
430         ierr      = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
431 
432         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
433       }
434       if (fout) {
435         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
436         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
437 
438         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
439       }
440       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
441       if (bout) {
442         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
443         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
444 
445         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
446       }
447       break;
448 #endif
449     case 2:
450 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
451       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
452       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
453       if (fin) {
454         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
455         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
456 
457         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
458       }
459       if (bout) {
460         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
461         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
462 
463         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
464       }
465       if (fout) {
466         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
467         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
468 
469         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
470       }
471 #else
472       /* Get local size */
473       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
474       if (fin) {
475         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
476         ierr     = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
477 
478         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
479       }
480       if (fout) {
481         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
482         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
483 
484         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
485       }
486       if (bout) {
487         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
488         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
489 
490         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
491       }
492 #endif
493       break;
494     case 3:
495 #if !defined(PETSC_USE_COMPLEX)
496       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
497       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
498       if (fin) {
499         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
500         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
501 
502         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
503       }
504       if (bout) {
505         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
506         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
507 
508         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
509       }
510 
511       if (fout) {
512         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
513         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
514 
515         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
516       }
517 #else
518       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
519       if (fin) {
520         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
521         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
522 
523         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
524       }
525       if (fout) {
526         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
527         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
528 
529         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
530       }
531       if (bout) {
532         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
533         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
534 
535         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
536       }
537 #endif
538       break;
539     default:
540 #if !defined(PETSC_USE_COMPLEX)
541       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
542 
543       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
544 
545       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
546       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
547 
548       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
549 
550       if (fin) {
551         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
552         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
553 
554         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
555       }
556       if (bout) {
557         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
558         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
559 
560         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
561       }
562 
563       if (fout) {
564         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
565         ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
566 
567         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
568       }
569 #else
570       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
571       if (fin) {
572         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
573         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
574 
575         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
576       }
577       if (fout) {
578         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
579         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
580 
581         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
582       }
583       if (bout) {
584         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
585         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
586 
587         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
588       }
589 #endif
590       break;
591     }
592   }
593   PetscFunctionReturn(0);
594 }
595 
596 #undef __FUNCT__
597 #define __FUNCT__ "VecScatterPetscToFFTW"
598 /*@
599    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
600 
601    Collective on Mat
602 
603    Input Parameters:
604 +  A - FFTW matrix
605 -  x - the PETSc vector
606 
607    Output Parameters:
608 .  y - the FFTW vector
609 
610   Options Database Keys:
611 . -mat_fftw_plannerflags - set FFTW planner flags
612 
613    Level: intermediate
614 
615    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
616          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
617          zeros. This routine does that job by scattering operation.
618 
619 .seealso: VecScatterFFTWToPetsc()
620 @*/
621 PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
622 {
623   PetscErrorCode ierr;
624 
625   PetscFunctionBegin;
626   ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
627   PetscFunctionReturn(0);
628 }
629 
630 #undef __FUNCT__
631 #define __FUNCT__ "VecScatterPetscToFFTW_FFTW"
632 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
633 {
634   PetscErrorCode ierr;
635   MPI_Comm       comm;
636   Mat_FFT        *fft  = (Mat_FFT*)A->data;
637   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
638   PetscInt       N     =fft->N;
639   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
640   PetscInt       low;
641   PetscInt       rank,size,vsize,vsize1;
642   ptrdiff_t      alloc_local,local_n0,local_0_start;
643   ptrdiff_t      local_n1,local_1_start;
644   VecScatter     vecscat;
645   IS             list1,list2;
646 #if !defined(PETSC_USE_COMPLEX)
647   PetscInt       i,j,k,partial_dim;
648   PetscInt       *indx1, *indx2, tempindx, tempindx1;
649   PetscInt       N1, n1,NM;
650   ptrdiff_t      temp;
651 #endif
652 
653   PetscFunctionBegin;
654   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
655   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
656   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
657   ierr = VecGetOwnershipRange(y,&low,NULL);
658 
659   if (size==1) {
660     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
661     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
662     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
663     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
664     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
665     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
666     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
667     ierr = ISDestroy(&list1);CHKERRQ(ierr);
668   } else {
669     switch (ndim) {
670     case 1:
671 #if defined(PETSC_USE_COMPLEX)
672       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
673 
674       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
675       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
676       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
677       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
678       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
679       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
680       ierr = ISDestroy(&list1);CHKERRQ(ierr);
681       ierr = ISDestroy(&list2);CHKERRQ(ierr);
682 #else
683       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
684 #endif
685       break;
686     case 2:
687 #if defined(PETSC_USE_COMPLEX)
688       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
689 
690       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
691       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
692       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
693       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
694       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
695       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
696       ierr = ISDestroy(&list1);CHKERRQ(ierr);
697       ierr = ISDestroy(&list2);CHKERRQ(ierr);
698 #else
699       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
700 
701       N1   = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
702       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
703       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
704 
705       if (dim[1]%2==0) {
706         NM = dim[1]+2;
707       } else {
708         NM = dim[1]+1;
709       }
710       for (i=0; i<local_n0; i++) {
711         for (j=0; j<dim[1]; j++) {
712           tempindx  = i*dim[1] + j;
713           tempindx1 = i*NM + j;
714 
715           indx1[tempindx]=local_0_start*dim[1]+tempindx;
716           indx2[tempindx]=low+tempindx1;
717         }
718       }
719 
720       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
721       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
722 
723       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
724       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
725       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
726       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
727       ierr = ISDestroy(&list1);CHKERRQ(ierr);
728       ierr = ISDestroy(&list2);CHKERRQ(ierr);
729       ierr = PetscFree(indx1);CHKERRQ(ierr);
730       ierr = PetscFree(indx2);CHKERRQ(ierr);
731 #endif
732       break;
733 
734     case 3:
735 #if defined(PETSC_USE_COMPLEX)
736       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
737 
738       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
739       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
740       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
741       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
742       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
743       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
744       ierr = ISDestroy(&list1);CHKERRQ(ierr);
745       ierr = ISDestroy(&list2);CHKERRQ(ierr);
746 #else
747       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
748 
749       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
750 
751       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
752       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
753 
754       if (dim[2]%2==0) NM = dim[2]+2;
755       else             NM = dim[2]+1;
756 
757       for (i=0; i<local_n0; i++) {
758         for (j=0; j<dim[1]; j++) {
759           for (k=0; k<dim[2]; k++) {
760             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
761             tempindx1 = i*dim[1]*NM + j*NM + k;
762 
763             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
764             indx2[tempindx]=low+tempindx1;
765           }
766         }
767       }
768 
769       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
770       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
771       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
772       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
773       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
774       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
775       ierr = ISDestroy(&list1);CHKERRQ(ierr);
776       ierr = ISDestroy(&list2);CHKERRQ(ierr);
777       ierr = PetscFree(indx1);CHKERRQ(ierr);
778       ierr = PetscFree(indx2);CHKERRQ(ierr);
779 #endif
780       break;
781 
782     default:
783 #if defined(PETSC_USE_COMPLEX)
784       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
785 
786       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
787       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
788       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
789       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
790       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
791       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
792       ierr = ISDestroy(&list1);CHKERRQ(ierr);
793       ierr = ISDestroy(&list2);CHKERRQ(ierr);
794 #else
795       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
796 
797       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
798 
799       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
800 
801       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
802 
803       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
804 
805       partial_dim = fftw->partial_dim;
806 
807       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
808       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
809 
810       if (dim[ndim-1]%2==0) NM = 2;
811       else                  NM = 1;
812 
813       j = low;
814       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
815         indx1[i] = local_0_start*partial_dim + i;
816         indx2[i] = j;
817         if (k%dim[ndim-1]==0) j+=NM;
818         j++;
819       }
820       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
821       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
822       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
823       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
824       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
825       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
826       ierr = ISDestroy(&list1);CHKERRQ(ierr);
827       ierr = ISDestroy(&list2);CHKERRQ(ierr);
828       ierr = PetscFree(indx1);CHKERRQ(ierr);
829       ierr = PetscFree(indx2);CHKERRQ(ierr);
830 #endif
831       break;
832     }
833   }
834   PetscFunctionReturn(0);
835 }
836 
837 #undef __FUNCT__
838 #define __FUNCT__ "VecScatterFFTWToPetsc"
839 /*@
840    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
841 
842    Collective on Mat
843 
844     Input Parameters:
845 +   A - FFTW matrix
846 -   x - FFTW vector
847 
848    Output Parameters:
849 .  y - PETSc vector
850 
851    Level: intermediate
852 
853    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
854          VecScatterFFTWToPetsc removes those extra zeros.
855 
856 .seealso: VecScatterPetscToFFTW()
857 @*/
858 PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
859 {
860   PetscErrorCode ierr;
861 
862   PetscFunctionBegin;
863   ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 #undef __FUNCT__
868 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
869 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
870 {
871   PetscErrorCode ierr;
872   MPI_Comm       comm;
873   Mat_FFT        *fft  = (Mat_FFT*)A->data;
874   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
875   PetscInt       N     = fft->N;
876   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
877   PetscInt       low;
878   PetscInt       rank,size;
879   ptrdiff_t      alloc_local,local_n0,local_0_start;
880   ptrdiff_t      local_n1,local_1_start;
881   VecScatter     vecscat;
882   IS             list1,list2;
883 #if !defined(PETSC_USE_COMPLEX)
884   PetscInt  i,j,k,partial_dim;
885   PetscInt  *indx1, *indx2, tempindx, tempindx1;
886   PetscInt  N1, n1,NM;
887   ptrdiff_t temp;
888 #endif
889 
890   PetscFunctionBegin;
891   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
892   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
893   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
894   ierr = VecGetOwnershipRange(x,&low,NULL);CHKERRQ(ierr);
895 
896   if (size==1) {
897     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
898     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
899     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
900     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
901     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
902     ierr = ISDestroy(&list1);CHKERRQ(ierr);
903 
904   } else {
905     switch (ndim) {
906     case 1:
907 #if defined(PETSC_USE_COMPLEX)
908       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
909 
910       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
911       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
912       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
913       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
914       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
915       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
916       ierr = ISDestroy(&list1);CHKERRQ(ierr);
917       ierr = ISDestroy(&list2);CHKERRQ(ierr);
918 #else
919       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
920 #endif
921       break;
922     case 2:
923 #if defined(PETSC_USE_COMPLEX)
924       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
925 
926       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
927       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
928       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
929       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
930       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
931       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
932       ierr = ISDestroy(&list1);CHKERRQ(ierr);
933       ierr = ISDestroy(&list2);CHKERRQ(ierr);
934 #else
935       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
936 
937       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
938 
939       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
940       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
941 
942       if (dim[1]%2==0) NM = dim[1]+2;
943       else             NM = dim[1]+1;
944 
945       for (i=0; i<local_n0; i++) {
946         for (j=0; j<dim[1]; j++) {
947           tempindx = i*dim[1] + j;
948           tempindx1 = i*NM + j;
949 
950           indx1[tempindx]=local_0_start*dim[1]+tempindx;
951           indx2[tempindx]=low+tempindx1;
952         }
953       }
954 
955       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
956       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
957 
958       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
959       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
960       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
961       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
962       ierr = ISDestroy(&list1);CHKERRQ(ierr);
963       ierr = ISDestroy(&list2);CHKERRQ(ierr);
964       ierr = PetscFree(indx1);CHKERRQ(ierr);
965       ierr = PetscFree(indx2);CHKERRQ(ierr);
966 #endif
967       break;
968     case 3:
969 #if defined(PETSC_USE_COMPLEX)
970       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
971 
972       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
973       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
974       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
975       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
976       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
977       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
978       ierr = ISDestroy(&list1);CHKERRQ(ierr);
979       ierr = ISDestroy(&list2);CHKERRQ(ierr);
980 #else
981 
982       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
983 
984       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
985 
986       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
987       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
988 
989       if (dim[2]%2==0) NM = dim[2]+2;
990       else             NM = dim[2]+1;
991 
992       for (i=0; i<local_n0; i++) {
993         for (j=0; j<dim[1]; j++) {
994           for (k=0; k<dim[2]; k++) {
995             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
996             tempindx1 = i*dim[1]*NM + j*NM + k;
997 
998             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
999             indx2[tempindx]=low+tempindx1;
1000           }
1001         }
1002       }
1003 
1004       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1005       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1006 
1007       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1008       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1009       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1010       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1011       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1012       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1013       ierr = PetscFree(indx1);CHKERRQ(ierr);
1014       ierr = PetscFree(indx2);CHKERRQ(ierr);
1015 #endif
1016       break;
1017     default:
1018 #if defined(PETSC_USE_COMPLEX)
1019       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1020 
1021       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1022       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1023       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1024       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1025       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1026       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1027       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1028       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1029 #else
1030       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1031 
1032       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1033 
1034       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1035 
1036       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
1037 
1038       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1039 
1040       partial_dim = fftw->partial_dim;
1041 
1042       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1043       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1044 
1045       if (dim[ndim-1]%2==0) NM = 2;
1046       else                  NM = 1;
1047 
1048       j = low;
1049       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1050         indx1[i] = local_0_start*partial_dim + i;
1051         indx2[i] = j;
1052         if (k%dim[ndim-1]==0) j+=NM;
1053         j++;
1054       }
1055       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1056       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1057 
1058       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1059       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1060       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1061       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1062       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1063       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1064       ierr = PetscFree(indx1);CHKERRQ(ierr);
1065       ierr = PetscFree(indx2);CHKERRQ(ierr);
1066 #endif
1067       break;
1068     }
1069   }
1070   PetscFunctionReturn(0);
1071 }
1072 
1073 #undef __FUNCT__
1074 #define __FUNCT__ "MatCreate_FFTW"
1075 /*
1076     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1077 
1078   Options Database Keys:
1079 + -mat_fftw_plannerflags - set FFTW planner flags
1080 
1081    Level: intermediate
1082 
1083 */
1084 PETSC_EXTERN PetscErrorCode MatCreate_FFTW(Mat A)
1085 {
1086   PetscErrorCode ierr;
1087   MPI_Comm       comm;
1088   Mat_FFT        *fft=(Mat_FFT*)A->data;
1089   Mat_FFTW       *fftw;
1090   PetscInt       n         =fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1091   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1092   PetscBool      flg;
1093   PetscInt       p_flag,partial_dim=1,ctr;
1094   PetscMPIInt    size,rank;
1095   ptrdiff_t      *pdim;
1096   ptrdiff_t      local_n1,local_1_start;
1097 #if !defined(PETSC_USE_COMPLEX)
1098   ptrdiff_t temp;
1099   PetscInt  N1,tot_dim;
1100 #else
1101   PetscInt n1;
1102 #endif
1103 
1104   PetscFunctionBegin;
1105   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1106   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1107   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1108 
1109   fftw_mpi_init();
1110   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1111   pdim[0] = dim[0];
1112 #if !defined(PETSC_USE_COMPLEX)
1113   tot_dim = 2*dim[0];
1114 #endif
1115   for (ctr=1; ctr<ndim; ctr++) {
1116     partial_dim *= dim[ctr];
1117     pdim[ctr]    = dim[ctr];
1118 #if !defined(PETSC_USE_COMPLEX)
1119     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1120     else             tot_dim *= dim[ctr];
1121 #endif
1122   }
1123 
1124   if (size == 1) {
1125 #if defined(PETSC_USE_COMPLEX)
1126     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1127     n    = N;
1128 #else
1129     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1130     n    = tot_dim;
1131 #endif
1132 
1133   } else {
1134     ptrdiff_t alloc_local,local_n0,local_0_start;
1135     switch (ndim) {
1136     case 1:
1137 #if !defined(PETSC_USE_COMPLEX)
1138       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1139 #else
1140       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1141 
1142       n    = (PetscInt)local_n0;
1143       n1   = (PetscInt)local_n1;
1144       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1145 #endif
1146       break;
1147     case 2:
1148 #if defined(PETSC_USE_COMPLEX)
1149       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1150       /*
1151        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1152        PetscSynchronizedFlush(comm,PETSC_STDOUT);
1153        */
1154       n    = (PetscInt)local_n0*dim[1];
1155       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1156 #else
1157       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1158 
1159       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1160       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1161 #endif
1162       break;
1163     case 3:
1164 #if defined(PETSC_USE_COMPLEX)
1165       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1166 
1167       n    = (PetscInt)local_n0*dim[1]*dim[2];
1168       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1169 #else
1170       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1171 
1172       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1173       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1174 #endif
1175       break;
1176     default:
1177 #if defined(PETSC_USE_COMPLEX)
1178       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1179 
1180       n    = (PetscInt)local_n0*partial_dim;
1181       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1182 #else
1183       temp = pdim[ndim-1];
1184 
1185       pdim[ndim-1] = temp/2 + 1;
1186 
1187       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1188 
1189       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1190       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1191 
1192       pdim[ndim-1] = temp;
1193 
1194       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1195 #endif
1196       break;
1197     }
1198   }
1199   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1200   ierr      = PetscNewLog(A,Mat_FFTW,&fftw);CHKERRQ(ierr);
1201   fft->data = (void*)fftw;
1202 
1203   fft->n            = n;
1204   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1205   fftw->partial_dim = partial_dim;
1206 
1207   ierr = PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t*)&(fftw->dim_fftw));CHKERRQ(ierr);
1208 
1209   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1210 
1211   fftw->p_forward  = 0;
1212   fftw->p_backward = 0;
1213   fftw->p_flag     = FFTW_ESTIMATE;
1214 
1215   if (size == 1) {
1216     A->ops->mult          = MatMult_SeqFFTW;
1217     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1218   } else {
1219     A->ops->mult          = MatMult_MPIFFTW;
1220     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1221   }
1222   fft->matdestroy = MatDestroy_FFTW;
1223   A->assembled    = PETSC_TRUE;
1224   A->preallocated = PETSC_TRUE;
1225 
1226   ierr = PetscObjectComposeFunction((PetscObject)A,"MatGetVecsFFTW_C",MatGetVecsFFTW_FFTW);CHKERRQ(ierr);
1227   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterPetscToFFTW_C",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1228   ierr = PetscObjectComposeFunction((PetscObject)A,"VecScatterFFTWToPetsc_C",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1229 
1230   /* get runtime options */
1231   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1232   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);CHKERRQ(ierr);
1233   if (flg) fftw->p_flag = (unsigned)p_flag;
1234   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1235   PetscFunctionReturn(0);
1236 }
1237 
1238 
1239 
1240 
1241