xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision 84df9cb40eca90ea9b18a456fab7a4ecc7f6c1a4)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
12 typedef struct {
13   ptrdiff_t   ndim_fftw,*dim_fftw;
14   PetscInt    partial_dim;
15   fftw_plan   p_forward,p_backward;
16   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
17   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
18                                                             executed for the arrays with which the plan was created */
19 } Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
28 /* MatMult_SeqFFTW performs forward DFT in parallel
29    Input parameter:
30      A - the matrix
31      x - the vector on which FDFT will be performed
32 
33    Output parameter:
34      y - vector that stores result of FDFT
35 */
36 #undef __FUNCT__
37 #define __FUNCT__ "MatMult_SeqFFTW"
38 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
39 {
40   PetscErrorCode ierr;
41   Mat_FFT        *fft  = (Mat_FFT*)A->data;
42   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
43   PetscScalar    *x_array,*y_array;
44   PetscInt       ndim=fft->ndim,*dim=fft->dim;
45 
46   PetscFunctionBegin;
47   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
48   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
49   if (!fftw->p_forward){ /* create a plan, then excute it */
50     switch (ndim){
51     case 1:
52 #if defined(PETSC_USE_COMPLEX)
53       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
54 #else
55       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
56 #endif
57       break;
58     case 2:
59 #if defined(PETSC_USE_COMPLEX)
60       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
61 #else
62       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
63 #endif
64       break;
65     case 3:
66 #if defined(PETSC_USE_COMPLEX)
67       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
68 #else
69       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
70 #endif
71       break;
72     default:
73 #if defined(PETSC_USE_COMPLEX)
74       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
75 #else
76       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double *)x_array,(fftw_complex*)y_array,fftw->p_flag);
77 #endif
78       break;
79     }
80     fftw->finarray  = x_array;
81     fftw->foutarray = y_array;
82     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
83                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
84     fftw_execute(fftw->p_forward);
85   } else { /* use existing plan */
86     if (fftw->finarray != x_array || fftw->foutarray != y_array){ /* use existing plan on new arrays */
87       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
88     } else {
89       fftw_execute(fftw->p_forward);
90     }
91   }
92   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
93   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
94   PetscFunctionReturn(0);
95 }
96 
97 /* MatMultTranspose_SeqFFTW performs serial backward DFT
98    Input parameter:
99      A - the matrix
100      x - the vector on which BDFT will be performed
101 
102    Output parameter:
103      y - vector that stores result of BDFT
104 */
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
108 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
109 {
110   PetscErrorCode ierr;
111   Mat_FFT        *fft = (Mat_FFT*)A->data;
112   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
113   PetscScalar    *x_array,*y_array;
114   PetscInt       ndim=fft->ndim,*dim=fft->dim;
115 
116   PetscFunctionBegin;
117   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
118   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
119   if (!fftw->p_backward){ /* create a plan, then excute it */
120     switch (ndim){
121     case 1:
122 #if defined(PETSC_USE_COMPLEX)
123       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
124 #else
125       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
126 #endif
127       break;
128     case 2:
129 #if defined(PETSC_USE_COMPLEX)
130       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
131 #else
132       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
133 #endif
134       break;
135     case 3:
136 #if defined(PETSC_USE_COMPLEX)
137       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
138 #else
139       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
140 #endif
141       break;
142     default:
143 #if defined(PETSC_USE_COMPLEX)
144       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
145 #else
146       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double *)y_array,fftw->p_flag);
147 #endif
148       break;
149     }
150     fftw->binarray  = x_array;
151     fftw->boutarray = y_array;
152     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
153   } else { /* use existing plan */
154     if (fftw->binarray != x_array || fftw->boutarray != y_array){ /* use existing plan on new arrays */
155       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
156     } else {
157       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
158     }
159   }
160   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
161   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
162   PetscFunctionReturn(0);
163 }
164 
165 /* MatMult_MPIFFTW performs forward DFT in parallel
166    Input parameter:
167      A - the matrix
168      x - the vector on which FDFT will be performed
169 
170    Output parameter:
171    y   - vector that stores result of FDFT
172 */
173 #undef __FUNCT__
174 #define __FUNCT__ "MatMult_MPIFFTW"
175 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
176 {
177   PetscErrorCode ierr;
178   Mat_FFT        *fft  = (Mat_FFT*)A->data;
179   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
180   PetscScalar    *x_array,*y_array;
181   PetscInt       ndim=fft->ndim,*dim=fft->dim;
182   MPI_Comm       comm=((PetscObject)A)->comm;
183 
184   PetscFunctionBegin;
185   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
186   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
187   if (!fftw->p_forward){ /* create a plan, then excute it */
188     switch (ndim){
189     case 1:
190 #if defined(PETSC_USE_COMPLEX)
191       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
192 #else
193       SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"not support for real numbers yet");
194 #endif
195       break;
196     case 2:
197 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
198       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
199 #else
200       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
201 #endif
202       break;
203     case 3:
204 #if defined(PETSC_USE_COMPLEX)
205       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
206 #else
207       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
208 #endif
209       break;
210     default:
211 #if defined(PETSC_USE_COMPLEX)
212       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
213 #else
214       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double *)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
215 #endif
216       break;
217     }
218     fftw->finarray  = x_array;
219     fftw->foutarray = y_array;
220     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
221                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
222     fftw_execute(fftw->p_forward);
223   } else { /* use existing plan */
224     if (fftw->finarray != x_array || fftw->foutarray != y_array){ /* use existing plan on new arrays */
225       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
226     } else {
227       fftw_execute(fftw->p_forward);
228     }
229   }
230   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
231   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
232   PetscFunctionReturn(0);
233 }
234 
235 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
236    Input parameter:
237      A - the matrix
238      x - the vector on which BDFT will be performed
239 
240    Output parameter:
241      y - vector that stores result of BDFT
242 */
243 #undef __FUNCT__
244 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
245 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
246 {
247   PetscErrorCode ierr;
248   Mat_FFT        *fft  = (Mat_FFT*)A->data;
249   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
250   PetscScalar    *x_array,*y_array;
251   PetscInt       ndim=fft->ndim,*dim=fft->dim;
252   MPI_Comm       comm=((PetscObject)A)->comm;
253 
254   PetscFunctionBegin;
255   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
256   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
257   if (!fftw->p_backward){ /* create a plan, then excute it */
258     switch (ndim){
259     case 1:
260 #if defined(PETSC_USE_COMPLEX)
261       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
262 #else
263       SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"not support for real numbers yet");
264 #endif
265       break;
266     case 2:
267 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
268       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
269 #else
270       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
271 #endif
272       break;
273     case 3:
274 #if defined(PETSC_USE_COMPLEX)
275       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
276 #else
277       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
278 #endif
279       break;
280     default:
281 #if defined(PETSC_USE_COMPLEX)
282       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
283 #else
284       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double *)y_array,comm,FFTW_ESTIMATE);
285 #endif
286       break;
287     }
288     fftw->binarray  = x_array;
289     fftw->boutarray = y_array;
290     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
291   } else { /* use existing plan */
292     if (fftw->binarray != x_array || fftw->boutarray != y_array){ /* use existing plan on new arrays */
293       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
294     } else {
295       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
296     }
297   }
298   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
299   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
300   PetscFunctionReturn(0);
301 }
302 
303 #undef __FUNCT__
304 #define __FUNCT__ "MatDestroy_FFTW"
305 PetscErrorCode MatDestroy_FFTW(Mat A)
306 {
307   Mat_FFT        *fft = (Mat_FFT*)A->data;
308   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
309   PetscErrorCode ierr;
310 
311   PetscFunctionBegin;
312   fftw_destroy_plan(fftw->p_forward);
313   fftw_destroy_plan(fftw->p_backward);
314   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
315   ierr = PetscFree(fft->data);CHKERRQ(ierr);
316   PetscFunctionReturn(0);
317 }
318 
319 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
320 #undef __FUNCT__
321 #define __FUNCT__ "VecDestroy_MPIFFTW"
322 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
323 {
324   PetscErrorCode  ierr;
325   PetscScalar     *array;
326 
327   PetscFunctionBegin;
328   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
329   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
330   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
331   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
332   PetscFunctionReturn(0);
333 }
334 
335 #undef __FUNCT__
336 #define __FUNCT__ "MatGetVecsFFTW"
337 /*@
338    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
339      parallel layout determined by FFTW
340 
341    Collective on Mat
342 
343    Input Parameter:
344 .   A - the matrix
345 
346    Output Parameter:
347 +   fin - (optional) input vector of forward FFTW
348 -   fout - (optional) output vector of forward FFTW
349 -   bout - (optional) output vector of backward FFTW
350 
351   Level: advanced
352 
353   Note: The parallel layout of output of forward FFTW is always same as the input
354         of the backward FFTW. But parallel layout of the input vector of forward
355         FFTW might not be same as the output of backward FFTW.
356         Also note that we need to provide enough space while doing parallel real transform.
357         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
358         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
359         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
360         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
361         zeros if it is odd only one zero is needed.
362         Lastly one needs some scratch space at the end of data set in each process. alloc_local
363         figures out how much space is needed, i.e. it figures out the data+scratch space for
364         each processor and returns that.
365 
366 .seealso: MatCreateFFTW()
367 @*/
368 PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
369 {
370   PetscErrorCode ierr;
371 
372   PetscFunctionBegin;
373   ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
374   PetscFunctionReturn(0);
375 }
376 
377 EXTERN_C_BEGIN
378 #undef __FUNCT__
379 #define __FUNCT__ "MatGetVecsFFTW_FFTW"
380 PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
381 {
382   PetscErrorCode ierr;
383   PetscMPIInt    size,rank;
384   MPI_Comm       comm=((PetscObject)A)->comm;
385   Mat_FFT        *fft = (Mat_FFT*)A->data;
386   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
387   PetscInt       N=fft->N;
388   PetscInt       ndim=fft->ndim,*dim=fft->dim,n=fft->n;
389 
390   PetscFunctionBegin;
391   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
392   PetscValidType(A,1);
393 
394   ierr = MPI_Comm_size(PETSC_COMM_WORLD, &size);CHKERRQ(ierr);
395   ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr);
396   if (size == 1){ /* sequential case */
397 #if defined(PETSC_USE_COMPLEX)
398     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
399     if (fout){ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
400     if (bout){ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
401 #else
402     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
403     if (fout){ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
404     if (bout){ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
405 #endif
406   } else { /* parallel cases */
407     ptrdiff_t      alloc_local,local_n0,local_0_start;
408     ptrdiff_t      local_n1;
409     fftw_complex   *data_fout;
410     ptrdiff_t      local_1_start;
411 #if defined(PETSC_USE_COMPLEX)
412     fftw_complex   *data_fin,*data_bout;
413 #else
414     double         *data_finr,*data_boutr;
415     PetscInt       n1,N1,vsize;
416     ptrdiff_t      temp;
417 #endif
418 
419     switch (ndim){
420     case 1:
421 #if !defined(PETSC_USE_COMPLEX)
422       SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
423 #else
424       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
425       if (fin) {
426         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
427         ierr = VecCreateMPIWithArray(comm,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
428         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
429       }
430       if (fout) {
431         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
432         ierr = VecCreateMPIWithArray(comm,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
433         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
434       }
435       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
436       if (bout) {
437         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
438         ierr = VecCreateMPIWithArray(comm,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
439         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
440       }
441       break;
442 #endif
443     case 2:
444 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
445       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
446       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
447       if (fin) {
448         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
449         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
450         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
451       }
452       if (bout) {
453         data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
454         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
455         ierr = VecGetSize(*bout,&vsize);CHKERRQ(ierr);
456         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
457       }
458       if (fout) {
459         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
460         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
461         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
462       }
463 #else
464       /* Get local size */
465       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
466       if (fin) {
467         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
468         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
469         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
470       }
471       if (fout) {
472         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
473         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
474         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
475       }
476       if (bout) {
477         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
478         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
479         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
480       }
481 #endif
482       break;
483     case 3:
484 #if !defined(PETSC_USE_COMPLEX)
485       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
486       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
487       if (fin) {
488         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
489         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
490         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
491       }
492       if (bout) {
493         data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
494         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
495         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
496       }
497 
498       if (fout) {
499         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
500         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
501         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
502       }
503 #else
504       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
505       if (fin) {
506         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
507         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
508         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
509       }
510       if (fout) {
511         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
512         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
513         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
514       }
515       if (bout) {
516         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
517         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
518         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
519       }
520 #endif
521       break;
522     default:
523 #if !defined(PETSC_USE_COMPLEX)
524       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
525       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
526       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
527       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
528       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
529 
530       if (fin) {
531         data_finr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
532         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
533         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
534       }
535       if (bout) {
536         data_boutr=(double *)fftw_malloc(sizeof(double)*alloc_local*2);
537         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
538         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
539       }
540 
541       if (fout) {
542         data_fout=(fftw_complex *)fftw_malloc(sizeof(fftw_complex)*alloc_local);
543         ierr = VecCreateMPIWithArray(PETSC_COMM_WORLD,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
544         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
545       }
546 #else
547       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
548       if (fin) {
549         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
550         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
551         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
552       }
553       if (fout) {
554         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
555         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
556         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
557       }
558       if (bout) {
559         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
560         ierr = VecCreateMPIWithArray(comm,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
561         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
562       }
563 #endif
564       break;
565     }
566   }
567   PetscFunctionReturn(0);
568 }
569 EXTERN_C_END
570 
571 #undef __FUNCT__
572 #define __FUNCT__ "VecScatterPetscToFFTW"
573 /*@
574    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
575 
576    Collective on Mat
577 
578    Input Parameters:
579 +  A - FFTW matrix
580 -  x - the PETSc vector
581 
582    Output Parameters:
583 .  y - the FFTW vector
584 
585   Options Database Keys:
586 . -mat_fftw_plannerflags - set FFTW planner flags
587 
588    Level: intermediate
589 
590    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
591          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
592          zeros. This routine does that job by scattering operation.
593 
594 .seealso: VecScatterFFTWToPetsc()
595 @*/
596 PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
597 {
598   PetscErrorCode ierr;
599 
600   PetscFunctionBegin;
601   ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
602   PetscFunctionReturn(0);
603 }
604 
605 EXTERN_C_BEGIN
606 #undef __FUNCT__
607 #define __FUNCT__ "VecScatterPetscToFFTW_FFTW"
608 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
609 {
610   PetscErrorCode ierr;
611   MPI_Comm       comm=((PetscObject)A)->comm;
612   Mat_FFT        *fft = (Mat_FFT*)A->data;
613   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
614   PetscInt       N=fft->N;
615   PetscInt       ndim=fft->ndim,*dim=fft->dim;
616   PetscInt       low;
617   PetscInt       rank,size,vsize,vsize1;
618   ptrdiff_t      alloc_local,local_n0,local_0_start;
619   ptrdiff_t      local_n1,local_1_start;
620   VecScatter     vecscat;
621   IS             list1,list2;
622 #if !defined(PETSC_USE_COMPLEX)
623   PetscInt       i,j,k,partial_dim;
624   PetscInt       *indx1, *indx2, tempindx, tempindx1;
625   PetscInt       N1, n1 ,NM;
626   ptrdiff_t      temp;
627 #endif
628 
629   PetscFunctionBegin;
630   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
631   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
632   ierr = VecGetOwnershipRange(y,&low,PETSC_NULL);
633 
634   if (size==1){
635     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
636     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
637     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
638     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
639     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
640     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
641     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
642     ierr = ISDestroy(&list1);CHKERRQ(ierr);
643   } else {
644     switch (ndim){
645     case 1:
646 #if defined(PETSC_USE_COMPLEX)
647       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
648       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0,local_0_start,1,&list1);
649       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0,low,1,&list2);
650       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
651       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
652       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
653       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
654       ierr = ISDestroy(&list1);CHKERRQ(ierr);
655       ierr = ISDestroy(&list2);CHKERRQ(ierr);
656 #else
657       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
658 #endif
659       break;
660     case 2:
661 #if defined(PETSC_USE_COMPLEX)
662       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
663       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],local_0_start*dim[1],1,&list1);
664       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],low,1,&list2);
665       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
666       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
667       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
668       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
669       ierr = ISDestroy(&list1);CHKERRQ(ierr);
670       ierr = ISDestroy(&list2);CHKERRQ(ierr);
671 #else
672       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
673       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
674       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
675       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
676 
677       if (dim[1]%2==0){
678         NM = dim[1]+2;
679       } else {
680         NM = dim[1]+1;
681       }
682       for (i=0;i<local_n0;i++){
683          for (j=0;j<dim[1];j++){
684            tempindx = i*dim[1] + j;
685            tempindx1 = i*NM + j;
686            indx1[tempindx]=local_0_start*dim[1]+tempindx;
687            indx2[tempindx]=low+tempindx1;
688          }
689       }
690 
691       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
692       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
693 
694       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
695       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
696       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
697       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
698       ierr = ISDestroy(&list1);CHKERRQ(ierr);
699       ierr = ISDestroy(&list2);CHKERRQ(ierr);
700       ierr = PetscFree(indx1);CHKERRQ(ierr);
701       ierr = PetscFree(indx2);CHKERRQ(ierr);
702 #endif
703       break;
704 
705     case 3:
706 #if defined(PETSC_USE_COMPLEX)
707       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
708       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
709       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],low,1,&list2);
710       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
711       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
712       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
713       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
714       ierr = ISDestroy(&list1);CHKERRQ(ierr);
715       ierr = ISDestroy(&list2);CHKERRQ(ierr);
716 #else
717       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
718       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
719 
720       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
721       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
722 
723       if (dim[2]%2==0)
724         NM = dim[2]+2;
725       else
726         NM = dim[2]+1;
727 
728       for (i=0;i<local_n0;i++){
729         for (j=0;j<dim[1];j++){
730           for (k=0;k<dim[2];k++){
731             tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
732             tempindx1 = i*dim[1]*NM + j*NM + k;
733             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
734             indx2[tempindx]=low+tempindx1;
735           }
736         }
737       }
738 
739       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
740       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
741       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
742       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
743       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
744       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
745       ierr = ISDestroy(&list1);CHKERRQ(ierr);
746       ierr = ISDestroy(&list2);CHKERRQ(ierr);
747       ierr = PetscFree(indx1);CHKERRQ(ierr);
748       ierr = PetscFree(indx2);CHKERRQ(ierr);
749 #endif
750       break;
751 
752     default:
753 #if defined(PETSC_USE_COMPLEX)
754       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
755       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
756       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),low,1,&list2);
757       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
758       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
759       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
760       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
761       ierr = ISDestroy(&list1);CHKERRQ(ierr);
762       ierr = ISDestroy(&list2);CHKERRQ(ierr);
763 #else
764       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
765       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
766       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
767       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
768       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
769 
770       partial_dim = fftw->partial_dim;
771 
772       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
773       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
774 
775       if (dim[ndim-1]%2==0)
776         NM = 2;
777       else
778         NM = 1;
779 
780       j = low;
781       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++){
782         indx1[i] = local_0_start*partial_dim + i;
783         indx2[i] = j;
784         if (k%dim[ndim-1]==0){ j+=NM;}
785         j++;
786       }
787       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
788       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
789       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
790       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
791       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
792       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
793       ierr = ISDestroy(&list1);CHKERRQ(ierr);
794       ierr = ISDestroy(&list2);CHKERRQ(ierr);
795       ierr = PetscFree(indx1);CHKERRQ(ierr);
796       ierr = PetscFree(indx2);CHKERRQ(ierr);
797 #endif
798       break;
799     }
800   }
801   PetscFunctionReturn(0);
802 }
803 EXTERN_C_END
804 
805 #undef __FUNCT__
806 #define __FUNCT__ "VecScatterFFTWToPetsc"
807 /*@
808    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
809 
810    Collective on Mat
811 
812     Input Parameters:
813 +   A - FFTW matrix
814 -   x - FFTW vector
815 
816    Output Parameters:
817 .  y - PETSc vector
818 
819    Level: intermediate
820 
821    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
822          VecScatterFFTWToPetsc removes those extra zeros.
823 
824 .seealso: VecScatterPetscToFFTW()
825 @*/
826 PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
827 {
828   PetscErrorCode ierr;
829   PetscFunctionBegin;
830   ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
831   PetscFunctionReturn(0);
832 }
833 
834 EXTERN_C_BEGIN
835 #undef __FUNCT__
836 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
837 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
838 {
839   PetscErrorCode ierr;
840   MPI_Comm       comm=((PetscObject)A)->comm;
841   Mat_FFT        *fft = (Mat_FFT*)A->data;
842   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
843   PetscInt       N=fft->N;
844   PetscInt       ndim=fft->ndim,*dim=fft->dim;
845   PetscInt       low;
846   PetscInt       rank,size;
847   ptrdiff_t      alloc_local,local_n0,local_0_start;
848   ptrdiff_t      local_n1,local_1_start;
849   VecScatter     vecscat;
850   IS             list1,list2;
851 #if !defined(PETSC_USE_COMPLEX)
852   PetscInt       i,j,k,partial_dim;
853   PetscInt       *indx1, *indx2, tempindx, tempindx1;
854   PetscInt       N1, n1 ,NM;
855   ptrdiff_t      temp;
856 #endif
857 
858   PetscFunctionBegin;
859   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
860   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
861   ierr = VecGetOwnershipRange(x,&low,PETSC_NULL);CHKERRQ(ierr);
862 
863   if (size==1){
864     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
865     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
866     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
867     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
868     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
869     ierr = ISDestroy(&list1);CHKERRQ(ierr);
870 
871   } else{
872     switch (ndim){
873     case 1:
874 #if defined(PETSC_USE_COMPLEX)
875       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
876       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n1,local_1_start,1,&list1);
877       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n1,low,1,&list2);
878       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
879       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
880       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
881       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
882       ierr = ISDestroy(&list1);CHKERRQ(ierr);
883       ierr = ISDestroy(&list2);CHKERRQ(ierr);
884 #else
885       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
886 #endif
887       break;
888     case 2:
889 #if defined(PETSC_USE_COMPLEX)
890       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
891       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],local_0_start*dim[1],1,&list1);
892       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1],low,1,&list2);
893       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
894       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
895       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
896       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
897       ierr = ISDestroy(&list1);CHKERRQ(ierr);
898       ierr = ISDestroy(&list2);CHKERRQ(ierr);
899 #else
900       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
901       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
902 
903       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
904       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
905 
906       if (dim[1]%2==0)
907         NM = dim[1]+2;
908       else
909         NM = dim[1]+1;
910 
911       for (i=0;i<local_n0;i++){
912         for (j=0;j<dim[1];j++){
913           tempindx = i*dim[1] + j;
914           tempindx1 = i*NM + j;
915           indx1[tempindx]=local_0_start*dim[1]+tempindx;
916           indx2[tempindx]=low+tempindx1;
917         }
918       }
919 
920       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
921       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
922 
923       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
924       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
925       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
926       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
927       ierr = ISDestroy(&list1);CHKERRQ(ierr);
928       ierr = ISDestroy(&list2);CHKERRQ(ierr);
929       ierr = PetscFree(indx1);CHKERRQ(ierr);
930       ierr = PetscFree(indx2);CHKERRQ(ierr);
931 #endif
932       break;
933     case 3:
934 #if defined(PETSC_USE_COMPLEX)
935       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
936       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
937       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*dim[1]*dim[2],low,1,&list2);
938       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
939       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
940       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
941       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
942       ierr = ISDestroy(&list1);CHKERRQ(ierr);
943       ierr = ISDestroy(&list2);CHKERRQ(ierr);
944 #else
945 
946       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
947       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
948 
949       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
950       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
951 
952       if (dim[2]%2==0)
953         NM = dim[2]+2;
954       else
955         NM = dim[2]+1;
956 
957       for (i=0;i<local_n0;i++){
958         for (j=0;j<dim[1];j++){
959           for (k=0;k<dim[2];k++){
960             tempindx = i*dim[1]*dim[2] + j*dim[2] + k;
961             tempindx1 = i*dim[1]*NM + j*NM + k;
962             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
963             indx2[tempindx]=low+tempindx1;
964           }
965         }
966       }
967 
968       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
969       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
970 
971       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
972       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
973       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
974       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
975       ierr = ISDestroy(&list1);CHKERRQ(ierr);
976       ierr = ISDestroy(&list2);CHKERRQ(ierr);
977       ierr = PetscFree(indx1);CHKERRQ(ierr);
978       ierr = PetscFree(indx2);CHKERRQ(ierr);
979 #endif
980       break;
981     default:
982 #if defined(PETSC_USE_COMPLEX)
983       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
984       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
985       ierr = ISCreateStride(PETSC_COMM_WORLD,local_n0*(fftw->partial_dim),low,1,&list2);
986       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
987       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
988       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
989       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
990       ierr = ISDestroy(&list1);CHKERRQ(ierr);
991       ierr = ISDestroy(&list2);CHKERRQ(ierr);
992 #else
993       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
994       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
995       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
996       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
997       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
998 
999       partial_dim = fftw->partial_dim;
1000 
1001       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1002       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1003 
1004       if (dim[ndim-1]%2==0)
1005         NM = 2;
1006       else
1007         NM = 1;
1008 
1009       j = low;
1010       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++){
1011         indx1[i] = local_0_start*partial_dim + i;
1012         indx2[i] = j;
1013         if (k%dim[ndim-1]==0)j+=NM;
1014         j++;
1015       }
1016       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1017       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1018 
1019       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1020       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1021       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1022       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1023       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1024       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1025       ierr = PetscFree(indx1);CHKERRQ(ierr);
1026       ierr = PetscFree(indx2);CHKERRQ(ierr);
1027 #endif
1028       break;
1029     }
1030   }
1031   PetscFunctionReturn(0);
1032 }
1033 EXTERN_C_END
1034 
1035 EXTERN_C_BEGIN
1036 #undef __FUNCT__
1037 #define __FUNCT__ "MatCreate_FFTW"
1038 /*
1039     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1040 
1041   Options Database Keys:
1042 + -mat_fftw_plannerflags - set FFTW planner flags
1043 
1044    Level: intermediate
1045 
1046 */
1047 PetscErrorCode MatCreate_FFTW(Mat A)
1048 {
1049   PetscErrorCode ierr;
1050   MPI_Comm       comm=((PetscObject)A)->comm;
1051   Mat_FFT        *fft=(Mat_FFT*)A->data;
1052   Mat_FFTW       *fftw;
1053   PetscInt       n=fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1054   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1055   PetscBool      flg;
1056   PetscInt       p_flag,partial_dim=1,ctr;
1057   PetscMPIInt    size,rank;
1058   ptrdiff_t      *pdim;
1059   ptrdiff_t      local_n1,local_1_start;
1060 #if !defined(PETSC_USE_COMPLEX)
1061    ptrdiff_t     temp;
1062    PetscInt      N1,tot_dim;
1063 #else
1064    PetscInt      n1;
1065 #endif
1066 
1067   PetscFunctionBegin;
1068   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1069   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1070 
1071   fftw_mpi_init();
1072   pdim = (ptrdiff_t *)calloc(ndim,sizeof(ptrdiff_t));
1073   pdim[0] = dim[0];
1074 #if !defined(PETSC_USE_COMPLEX)
1075   tot_dim = 2*dim[0];
1076 #endif
1077   for (ctr=1;ctr<ndim;ctr++){
1078     partial_dim *= dim[ctr];
1079     pdim[ctr] = dim[ctr];
1080 #if !defined(PETSC_USE_COMPLEX)
1081     if (ctr==ndim-1)
1082       tot_dim *= (dim[ctr]/2+1);
1083     else
1084       tot_dim *= dim[ctr];
1085 #endif
1086   }
1087 
1088   if (size == 1) {
1089 #if defined(PETSC_USE_COMPLEX)
1090     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1091     n = N;
1092 #else
1093     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1094     n = tot_dim;
1095 #endif
1096 
1097   } else {
1098     ptrdiff_t alloc_local,local_n0,local_0_start;
1099     switch (ndim){
1100     case 1:
1101 #if !defined(PETSC_USE_COMPLEX)
1102       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1103 #else
1104       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1105       n = (PetscInt)local_n0;
1106       n1 = (PetscInt) local_n1;
1107       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1108 #endif
1109       break;
1110     case 2:
1111 #if defined(PETSC_USE_COMPLEX)
1112       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1113       /*
1114        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1115        PetscSynchronizedFlush(comm);
1116        */
1117       n = (PetscInt)local_n0*dim[1];
1118       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1119 #else
1120       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1121       n = 2*(PetscInt)local_n0*(dim[1]/2+1);
1122       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1123 #endif
1124       break;
1125     case 3:
1126 #if defined(PETSC_USE_COMPLEX)
1127       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1128       n = (PetscInt)local_n0*dim[1]*dim[2];
1129       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1130 #else
1131       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1132       n = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1133       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1134 #endif
1135       break;
1136     default:
1137 #if defined(PETSC_USE_COMPLEX)
1138       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1139       n = (PetscInt)local_n0*partial_dim;
1140       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1141 #else
1142       temp = pdim[ndim-1];
1143       pdim[ndim-1] = temp/2 + 1;
1144       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,PETSC_COMM_WORLD,&local_n0,&local_0_start,&local_n1,&local_1_start);
1145       n = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1146       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1147       pdim[ndim-1] = temp;
1148       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1149 #endif
1150       break;
1151     }
1152   }
1153   ierr = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1154   ierr = PetscNewLog(A,Mat_FFTW,&fftw);CHKERRQ(ierr);
1155   fft->data = (void*)fftw;
1156 
1157   fft->n            = n;
1158   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1159   fftw->partial_dim = partial_dim;
1160   ierr = PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t *)&(fftw->dim_fftw));CHKERRQ(ierr);
1161   for(ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1162 
1163   fftw->p_forward  = 0;
1164   fftw->p_backward = 0;
1165   fftw->p_flag     = FFTW_ESTIMATE;
1166 
1167   if (size == 1){
1168     A->ops->mult          = MatMult_SeqFFTW;
1169     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1170   } else {
1171     A->ops->mult          = MatMult_MPIFFTW;
1172     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1173   }
1174   fft->matdestroy          = MatDestroy_FFTW;
1175   A->assembled          = PETSC_TRUE;
1176   PetscObjectComposeFunctionDynamic((PetscObject)A,"MatGetVecsFFTW_C","MatGetVecsFFTW_FFTW",MatGetVecsFFTW_FFTW);
1177   PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterPetscToFFTW_C","VecScatterPetscToFFTW_FFTW",VecScatterPetscToFFTW_FFTW);
1178   PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterFFTWToPetsc_C","VecScatterFFTWToPetsc_FFTW",VecScatterFFTWToPetsc_FFTW);
1179 
1180   /* get runtime options */
1181   ierr = PetscOptionsBegin(((PetscObject)A)->comm,((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1182     ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);CHKERRQ(ierr);
1183     if (flg) {fftw->p_flag = (unsigned)p_flag;}
1184   PetscOptionsEnd();
1185   PetscFunctionReturn(0);
1186 }
1187 EXTERN_C_END
1188 
1189 
1190 
1191 
1192