xref: /petsc/src/mat/impls/fft/fftw/fftw.c (revision ce94432eddcd14845bc7e8083b7f8ea723b9bf7d)
1 
2 /*
3     Provides an interface to the FFTW package.
4     Testing examples can be found in ~src/mat/examples/tests
5 */
6 
7 #include <../src/mat/impls/fft/fft.h>   /*I "petscmat.h" I*/
8 EXTERN_C_BEGIN
9 #include <fftw3-mpi.h>
10 EXTERN_C_END
11 
12 typedef struct {
13   ptrdiff_t   ndim_fftw,*dim_fftw;
14   PetscInt    partial_dim;
15   fftw_plan   p_forward,p_backward;
16   unsigned    p_flag; /* planner flags, FFTW_ESTIMATE,FFTW_MEASURE, FFTW_PATIENT, FFTW_EXHAUSTIVE */
17   PetscScalar *finarray,*foutarray,*binarray,*boutarray; /* keep track of arrays becaue fftw plan should be
18                                                             executed for the arrays with which the plan was created */
19 } Mat_FFTW;
20 
21 extern PetscErrorCode MatMult_SeqFFTW(Mat,Vec,Vec);
22 extern PetscErrorCode MatMultTranspose_SeqFFTW(Mat,Vec,Vec);
23 extern PetscErrorCode MatMult_MPIFFTW(Mat,Vec,Vec);
24 extern PetscErrorCode MatMultTranspose_MPIFFTW(Mat,Vec,Vec);
25 extern PetscErrorCode MatDestroy_FFTW(Mat);
26 extern PetscErrorCode VecDestroy_MPIFFTW(Vec);
27 
28 /* MatMult_SeqFFTW performs forward DFT in parallel
29    Input parameter:
30      A - the matrix
31      x - the vector on which FDFT will be performed
32 
33    Output parameter:
34      y - vector that stores result of FDFT
35 */
36 #undef __FUNCT__
37 #define __FUNCT__ "MatMult_SeqFFTW"
38 PetscErrorCode MatMult_SeqFFTW(Mat A,Vec x,Vec y)
39 {
40   PetscErrorCode ierr;
41   Mat_FFT        *fft  = (Mat_FFT*)A->data;
42   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
43   PetscScalar    *x_array,*y_array;
44   PetscInt       ndim=fft->ndim,*dim=fft->dim;
45 
46   PetscFunctionBegin;
47   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
48   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
49   if (!fftw->p_forward) { /* create a plan, then excute it */
50     switch (ndim) {
51     case 1:
52 #if defined(PETSC_USE_COMPLEX)
53       fftw->p_forward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
54 #else
55       fftw->p_forward = fftw_plan_dft_r2c_1d(dim[0],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
56 #endif
57       break;
58     case 2:
59 #if defined(PETSC_USE_COMPLEX)
60       fftw->p_forward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
61 #else
62       fftw->p_forward = fftw_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
63 #endif
64       break;
65     case 3:
66 #if defined(PETSC_USE_COMPLEX)
67       fftw->p_forward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
68 #else
69       fftw->p_forward = fftw_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
70 #endif
71       break;
72     default:
73 #if defined(PETSC_USE_COMPLEX)
74       fftw->p_forward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_FORWARD,fftw->p_flag);
75 #else
76       fftw->p_forward = fftw_plan_dft_r2c(ndim,dim,(double*)x_array,(fftw_complex*)y_array,fftw->p_flag);
77 #endif
78       break;
79     }
80     fftw->finarray  = x_array;
81     fftw->foutarray = y_array;
82     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
83                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
84     fftw_execute(fftw->p_forward);
85   } else { /* use existing plan */
86     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
87       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
88     } else {
89       fftw_execute(fftw->p_forward);
90     }
91   }
92   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
93   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
94   PetscFunctionReturn(0);
95 }
96 
97 /* MatMultTranspose_SeqFFTW performs serial backward DFT
98    Input parameter:
99      A - the matrix
100      x - the vector on which BDFT will be performed
101 
102    Output parameter:
103      y - vector that stores result of BDFT
104 */
105 
106 #undef __FUNCT__
107 #define __FUNCT__ "MatMultTranspose_SeqFFTW"
108 PetscErrorCode MatMultTranspose_SeqFFTW(Mat A,Vec x,Vec y)
109 {
110   PetscErrorCode ierr;
111   Mat_FFT        *fft  = (Mat_FFT*)A->data;
112   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
113   PetscScalar    *x_array,*y_array;
114   PetscInt       ndim=fft->ndim,*dim=fft->dim;
115 
116   PetscFunctionBegin;
117   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
118   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
119   if (!fftw->p_backward) { /* create a plan, then excute it */
120     switch (ndim) {
121     case 1:
122 #if defined(PETSC_USE_COMPLEX)
123       fftw->p_backward = fftw_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
124 #else
125       fftw->p_backward= fftw_plan_dft_c2r_1d(dim[0],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
126 #endif
127       break;
128     case 2:
129 #if defined(PETSC_USE_COMPLEX)
130       fftw->p_backward = fftw_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
131 #else
132       fftw->p_backward= fftw_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
133 #endif
134       break;
135     case 3:
136 #if defined(PETSC_USE_COMPLEX)
137       fftw->p_backward = fftw_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
138 #else
139       fftw->p_backward= fftw_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
140 #endif
141       break;
142     default:
143 #if defined(PETSC_USE_COMPLEX)
144       fftw->p_backward = fftw_plan_dft(ndim,dim,(fftw_complex*)x_array,(fftw_complex*)y_array,FFTW_BACKWARD,fftw->p_flag);
145 #else
146       fftw->p_backward= fftw_plan_dft_c2r(ndim,dim,(fftw_complex*)x_array,(double*)y_array,fftw->p_flag);
147 #endif
148       break;
149     }
150     fftw->binarray  = x_array;
151     fftw->boutarray = y_array;
152     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
153   } else { /* use existing plan */
154     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
155       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
156     } else {
157       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
158     }
159   }
160   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
161   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
162   PetscFunctionReturn(0);
163 }
164 
165 /* MatMult_MPIFFTW performs forward DFT in parallel
166    Input parameter:
167      A - the matrix
168      x - the vector on which FDFT will be performed
169 
170    Output parameter:
171    y   - vector that stores result of FDFT
172 */
173 #undef __FUNCT__
174 #define __FUNCT__ "MatMult_MPIFFTW"
175 PetscErrorCode MatMult_MPIFFTW(Mat A,Vec x,Vec y)
176 {
177   PetscErrorCode ierr;
178   Mat_FFT        *fft  = (Mat_FFT*)A->data;
179   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
180   PetscScalar    *x_array,*y_array;
181   PetscInt       ndim=fft->ndim,*dim=fft->dim;
182   MPI_Comm       comm;
183 
184   PetscFunctionBegin;
185   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
186   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
187   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
188   if (!fftw->p_forward) { /* create a plan, then excute it */
189     switch (ndim) {
190     case 1:
191 #if defined(PETSC_USE_COMPLEX)
192       fftw->p_forward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
193 #else
194       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
195 #endif
196       break;
197     case 2:
198 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft, for real transforms call fftw_mpi_plan_dft_r2c */
199       fftw->p_forward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
200 #else
201       fftw->p_forward = fftw_mpi_plan_dft_r2c_2d(dim[0],dim[1],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
202 #endif
203       break;
204     case 3:
205 #if defined(PETSC_USE_COMPLEX)
206       fftw->p_forward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
207 #else
208       fftw->p_forward = fftw_mpi_plan_dft_r2c_3d(dim[0],dim[1],dim[2],(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
209 #endif
210       break;
211     default:
212 #if defined(PETSC_USE_COMPLEX)
213       fftw->p_forward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_FORWARD,fftw->p_flag);
214 #else
215       fftw->p_forward = fftw_mpi_plan_dft_r2c(fftw->ndim_fftw,fftw->dim_fftw,(double*)x_array,(fftw_complex*)y_array,comm,FFTW_ESTIMATE);
216 #endif
217       break;
218     }
219     fftw->finarray  = x_array;
220     fftw->foutarray = y_array;
221     /* Warning: if (fftw->p_flag!==FFTW_ESTIMATE) The data in the in/out arrays is overwritten!
222                 planning should be done before x is initialized! See FFTW manual sec2.1 or sec4 */
223     fftw_execute(fftw->p_forward);
224   } else { /* use existing plan */
225     if (fftw->finarray != x_array || fftw->foutarray != y_array) { /* use existing plan on new arrays */
226       fftw_execute_dft(fftw->p_forward,(fftw_complex*)x_array,(fftw_complex*)y_array);
227     } else {
228       fftw_execute(fftw->p_forward);
229     }
230   }
231   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
232   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
233   PetscFunctionReturn(0);
234 }
235 
236 /* MatMultTranspose_MPIFFTW performs parallel backward DFT
237    Input parameter:
238      A - the matrix
239      x - the vector on which BDFT will be performed
240 
241    Output parameter:
242      y - vector that stores result of BDFT
243 */
244 #undef __FUNCT__
245 #define __FUNCT__ "MatMultTranspose_MPIFFTW"
246 PetscErrorCode MatMultTranspose_MPIFFTW(Mat A,Vec x,Vec y)
247 {
248   PetscErrorCode ierr;
249   Mat_FFT        *fft  = (Mat_FFT*)A->data;
250   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
251   PetscScalar    *x_array,*y_array;
252   PetscInt       ndim=fft->ndim,*dim=fft->dim;
253   MPI_Comm       comm;
254 
255   PetscFunctionBegin;
256   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
257   ierr = VecGetArray(x,&x_array);CHKERRQ(ierr);
258   ierr = VecGetArray(y,&y_array);CHKERRQ(ierr);
259   if (!fftw->p_backward) { /* create a plan, then excute it */
260     switch (ndim) {
261     case 1:
262 #if defined(PETSC_USE_COMPLEX)
263       fftw->p_backward = fftw_mpi_plan_dft_1d(dim[0],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
264 #else
265       SETERRQ(comm,PETSC_ERR_SUP,"not support for real numbers yet");
266 #endif
267       break;
268     case 2:
269 #if defined(PETSC_USE_COMPLEX) /* For complex transforms call fftw_mpi_plan_dft with flag FFTW_BACKWARD, for real transforms call fftw_mpi_plan_dft_c2r */
270       fftw->p_backward = fftw_mpi_plan_dft_2d(dim[0],dim[1],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
271 #else
272       fftw->p_backward = fftw_mpi_plan_dft_c2r_2d(dim[0],dim[1],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
273 #endif
274       break;
275     case 3:
276 #if defined(PETSC_USE_COMPLEX)
277       fftw->p_backward = fftw_mpi_plan_dft_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
278 #else
279       fftw->p_backward = fftw_mpi_plan_dft_c2r_3d(dim[0],dim[1],dim[2],(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
280 #endif
281       break;
282     default:
283 #if defined(PETSC_USE_COMPLEX)
284       fftw->p_backward = fftw_mpi_plan_dft(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(fftw_complex*)y_array,comm,FFTW_BACKWARD,fftw->p_flag);
285 #else
286       fftw->p_backward = fftw_mpi_plan_dft_c2r(fftw->ndim_fftw,fftw->dim_fftw,(fftw_complex*)x_array,(double*)y_array,comm,FFTW_ESTIMATE);
287 #endif
288       break;
289     }
290     fftw->binarray  = x_array;
291     fftw->boutarray = y_array;
292     fftw_execute(fftw->p_backward);CHKERRQ(ierr);
293   } else { /* use existing plan */
294     if (fftw->binarray != x_array || fftw->boutarray != y_array) { /* use existing plan on new arrays */
295       fftw_execute_dft(fftw->p_backward,(fftw_complex*)x_array,(fftw_complex*)y_array);
296     } else {
297       fftw_execute(fftw->p_backward);CHKERRQ(ierr);
298     }
299   }
300   ierr = VecRestoreArray(y,&y_array);CHKERRQ(ierr);
301   ierr = VecRestoreArray(x,&x_array);CHKERRQ(ierr);
302   PetscFunctionReturn(0);
303 }
304 
305 #undef __FUNCT__
306 #define __FUNCT__ "MatDestroy_FFTW"
307 PetscErrorCode MatDestroy_FFTW(Mat A)
308 {
309   Mat_FFT        *fft  = (Mat_FFT*)A->data;
310   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
311   PetscErrorCode ierr;
312 
313   PetscFunctionBegin;
314   fftw_destroy_plan(fftw->p_forward);
315   fftw_destroy_plan(fftw->p_backward);
316   ierr = PetscFree(fftw->dim_fftw);CHKERRQ(ierr);
317   ierr = PetscFree(fft->data);CHKERRQ(ierr);
318   fftw_mpi_cleanup();
319   PetscFunctionReturn(0);
320 }
321 
322 #include <../src/vec/vec/impls/mpi/pvecimpl.h>   /*I  "petscvec.h"   I*/
323 #undef __FUNCT__
324 #define __FUNCT__ "VecDestroy_MPIFFTW"
325 PetscErrorCode VecDestroy_MPIFFTW(Vec v)
326 {
327   PetscErrorCode ierr;
328   PetscScalar    *array;
329 
330   PetscFunctionBegin;
331   ierr = VecGetArray(v,&array);CHKERRQ(ierr);
332   fftw_free((fftw_complex*)array);CHKERRQ(ierr);
333   ierr = VecRestoreArray(v,&array);CHKERRQ(ierr);
334   ierr = VecDestroy_MPI(v);CHKERRQ(ierr);
335   PetscFunctionReturn(0);
336 }
337 
338 #undef __FUNCT__
339 #define __FUNCT__ "MatGetVecsFFTW"
340 /*@
341    MatGetVecsFFTW - Get vector(s) compatible with the matrix, i.e. with the
342      parallel layout determined by FFTW
343 
344    Collective on Mat
345 
346    Input Parameter:
347 .   A - the matrix
348 
349    Output Parameter:
350 +   x - (optional) input vector of forward FFTW
351 -   y - (optional) output vector of forward FFTW
352 -   z - (optional) output vector of backward FFTW
353 
354   Level: advanced
355 
356   Note: The parallel layout of output of forward FFTW is always same as the input
357         of the backward FFTW. But parallel layout of the input vector of forward
358         FFTW might not be same as the output of backward FFTW.
359         Also note that we need to provide enough space while doing parallel real transform.
360         We need to pad extra zeros at the end of last dimension. For this reason the one needs to
361         invoke the routine fftw_mpi_local_size_transposed routines. Remember one has to change the
362         last dimension from n to n/2+1 while invoking this routine. The number of zeros to be padded
363         depends on if the last dimension is even or odd. If the last dimension is even need to pad two
364         zeros if it is odd only one zero is needed.
365         Lastly one needs some scratch space at the end of data set in each process. alloc_local
366         figures out how much space is needed, i.e. it figures out the data+scratch space for
367         each processor and returns that.
368 
369 .seealso: MatCreateFFTW()
370 @*/
371 PetscErrorCode MatGetVecsFFTW(Mat A,Vec *x,Vec *y,Vec *z)
372 {
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   ierr = PetscTryMethod(A,"MatGetVecsFFTW_C",(Mat,Vec*,Vec*,Vec*),(A,x,y,z));CHKERRQ(ierr);
377   PetscFunctionReturn(0);
378 }
379 
380 EXTERN_C_BEGIN
381 #undef __FUNCT__
382 #define __FUNCT__ "MatGetVecsFFTW_FFTW"
383 PetscErrorCode  MatGetVecsFFTW_FFTW(Mat A,Vec *fin,Vec *fout,Vec *bout)
384 {
385   PetscErrorCode ierr;
386   PetscMPIInt    size,rank;
387   MPI_Comm       comm;
388   Mat_FFT        *fft  = (Mat_FFT*)A->data;
389   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
390   PetscInt       N     = fft->N;
391   PetscInt       ndim  = fft->ndim,*dim=fft->dim,n=fft->n;
392 
393   PetscFunctionBegin;
394   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
395   PetscValidType(A,1);
396   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
397 
398   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
399   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
400   if (size == 1) { /* sequential case */
401 #if defined(PETSC_USE_COMPLEX)
402     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fin);CHKERRQ(ierr);}
403     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,fout);CHKERRQ(ierr);}
404     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,N,bout);CHKERRQ(ierr);}
405 #else
406     if (fin) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fin);CHKERRQ(ierr);}
407     if (fout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,fout);CHKERRQ(ierr);}
408     if (bout) {ierr = VecCreateSeq(PETSC_COMM_SELF,n,bout);CHKERRQ(ierr);}
409 #endif
410   } else { /* parallel cases */
411     ptrdiff_t    alloc_local,local_n0,local_0_start;
412     ptrdiff_t    local_n1;
413     fftw_complex *data_fout;
414     ptrdiff_t    local_1_start;
415 #if defined(PETSC_USE_COMPLEX)
416     fftw_complex *data_fin,*data_bout;
417 #else
418     double    *data_finr,*data_boutr;
419     PetscInt  n1,N1;
420     ptrdiff_t temp;
421 #endif
422 
423     switch (ndim) {
424     case 1:
425 #if !defined(PETSC_USE_COMPLEX)
426       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not allow parallel real 1D transform");
427 #else
428       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
429       if (fin) {
430         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
431         ierr      = VecCreateMPIWithArray(comm,1,local_n0,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
432 
433         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
434       }
435       if (fout) {
436         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
437         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
438 
439         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
440       }
441       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
442       if (bout) {
443         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
444         ierr      = VecCreateMPIWithArray(comm,1,local_n1,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
445 
446         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
447       }
448       break;
449 #endif
450     case 2:
451 #if !defined(PETSC_USE_COMPLEX) /* Note that N1 is no more the product of individual dimensions */
452       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
453       N1          = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
454       if (fin) {
455         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
456         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
457 
458         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
459       }
460       if (bout) {
461         data_boutr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
462         ierr       = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
463 
464         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
465       }
466       if (fout) {
467         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
468         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
469 
470         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
471       }
472 #else
473       /* Get local size */
474       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
475       if (fin) {
476         data_fin = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
477         ierr     = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
478 
479         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
480       }
481       if (fout) {
482         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
483         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
484 
485         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
486       }
487       if (bout) {
488         data_bout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
489         ierr      = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
490 
491         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
492       }
493 #endif
494       break;
495     case 3:
496 #if !defined(PETSC_USE_COMPLEX)
497       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
498       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
499       if (fin) {
500         data_finr = (double*)fftw_malloc(sizeof(double)*alloc_local*2);
501         ierr      = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
502 
503         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
504       }
505       if (bout) {
506         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
507         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n1,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
508 
509         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
510       }
511 
512       if (fout) {
513         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
514         ierr = VecCreateMPIWithArray(comm,1,n1,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
515 
516         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
517       }
518 #else
519       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
520       if (fin) {
521         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
522         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
523 
524         (*fin)->ops->destroy   = VecDestroy_MPIFFTW;
525       }
526       if (fout) {
527         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
528         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
529 
530         (*fout)->ops->destroy   = VecDestroy_MPIFFTW;
531       }
532       if (bout) {
533         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
534         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
535 
536         (*bout)->ops->destroy   = VecDestroy_MPIFFTW;
537       }
538 #endif
539       break;
540     default:
541 #if !defined(PETSC_USE_COMPLEX)
542       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
543 
544       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
545 
546       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
547       N1          = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
548 
549       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
550 
551       if (fin) {
552         data_finr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
553         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_finr,fin);CHKERRQ(ierr);
554 
555         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
556       }
557       if (bout) {
558         data_boutr=(double*)fftw_malloc(sizeof(double)*alloc_local*2);
559         ierr = VecCreateMPIWithArray(comm,1,(PetscInt)n,N1,(PetscScalar*)data_boutr,bout);CHKERRQ(ierr);
560 
561         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
562       }
563 
564       if (fout) {
565         data_fout=(fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
566         ierr = VecCreateMPIWithArray(comm,1,n,N1,(PetscScalar*)data_fout,fout);CHKERRQ(ierr);
567 
568         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
569       }
570 #else
571       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
572       if (fin) {
573         data_fin  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
574         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fin,fin);CHKERRQ(ierr);
575 
576         (*fin)->ops->destroy = VecDestroy_MPIFFTW;
577       }
578       if (fout) {
579         data_fout = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
580         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_fout,fout);CHKERRQ(ierr);
581 
582         (*fout)->ops->destroy = VecDestroy_MPIFFTW;
583       }
584       if (bout) {
585         data_bout  = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*alloc_local);
586         ierr = VecCreateMPIWithArray(comm,1,n,N,(const PetscScalar*)data_bout,bout);CHKERRQ(ierr);
587 
588         (*bout)->ops->destroy = VecDestroy_MPIFFTW;
589       }
590 #endif
591       break;
592     }
593   }
594   PetscFunctionReturn(0);
595 }
596 EXTERN_C_END
597 
598 #undef __FUNCT__
599 #define __FUNCT__ "VecScatterPetscToFFTW"
600 /*@
601    VecScatterPetscToFFTW - Copies the PETSc vector to the vector that goes into FFTW block.
602 
603    Collective on Mat
604 
605    Input Parameters:
606 +  A - FFTW matrix
607 -  x - the PETSc vector
608 
609    Output Parameters:
610 .  y - the FFTW vector
611 
612   Options Database Keys:
613 . -mat_fftw_plannerflags - set FFTW planner flags
614 
615    Level: intermediate
616 
617    Note: For real parallel FFT, FFTW requires insertion of extra space at the end of last dimension. This required even when
618          one is not doing in-place transform. The last dimension size must be changed to 2*(dim[last]/2+1) to accommodate these extra
619          zeros. This routine does that job by scattering operation.
620 
621 .seealso: VecScatterFFTWToPetsc()
622 @*/
623 PetscErrorCode VecScatterPetscToFFTW(Mat A,Vec x,Vec y)
624 {
625   PetscErrorCode ierr;
626 
627   PetscFunctionBegin;
628   ierr = PetscTryMethod(A,"VecScatterPetscToFFTW_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
629   PetscFunctionReturn(0);
630 }
631 
632 EXTERN_C_BEGIN
633 #undef __FUNCT__
634 #define __FUNCT__ "VecScatterPetscToFFTW_FFTW"
635 PetscErrorCode VecScatterPetscToFFTW_FFTW(Mat A,Vec x,Vec y)
636 {
637   PetscErrorCode ierr;
638   MPI_Comm       comm;
639   Mat_FFT        *fft  = (Mat_FFT*)A->data;
640   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
641   PetscInt       N     =fft->N;
642   PetscInt       ndim  =fft->ndim,*dim=fft->dim;
643   PetscInt       low;
644   PetscInt       rank,size,vsize,vsize1;
645   ptrdiff_t      alloc_local,local_n0,local_0_start;
646   ptrdiff_t      local_n1,local_1_start;
647   VecScatter     vecscat;
648   IS             list1,list2;
649 #if !defined(PETSC_USE_COMPLEX)
650   PetscInt       i,j,k,partial_dim;
651   PetscInt       *indx1, *indx2, tempindx, tempindx1;
652   PetscInt       N1, n1,NM;
653   ptrdiff_t      temp;
654 #endif
655 
656   PetscFunctionBegin;
657   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
658   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
659   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
660   ierr = VecGetOwnershipRange(y,&low,NULL);
661 
662   if (size==1) {
663     ierr = VecGetSize(x,&vsize);CHKERRQ(ierr);
664     ierr = VecGetSize(y,&vsize1);CHKERRQ(ierr);
665     ierr = ISCreateStride(PETSC_COMM_SELF,N,0,1,&list1);CHKERRQ(ierr);
666     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
667     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
668     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
669     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
670     ierr = ISDestroy(&list1);CHKERRQ(ierr);
671   } else {
672     switch (ndim) {
673     case 1:
674 #if defined(PETSC_USE_COMPLEX)
675       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
676 
677       ierr = ISCreateStride(comm,local_n0,local_0_start,1,&list1);
678       ierr = ISCreateStride(comm,local_n0,low,1,&list2);
679       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
680       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
681       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
682       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
683       ierr = ISDestroy(&list1);CHKERRQ(ierr);
684       ierr = ISDestroy(&list2);CHKERRQ(ierr);
685 #else
686       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
687 #endif
688       break;
689     case 2:
690 #if defined(PETSC_USE_COMPLEX)
691       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
692 
693       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
694       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
695       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
696       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
697       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
698       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
699       ierr = ISDestroy(&list1);CHKERRQ(ierr);
700       ierr = ISDestroy(&list2);CHKERRQ(ierr);
701 #else
702       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
703 
704       N1   = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
705       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
706       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
707 
708       if (dim[1]%2==0) {
709         NM = dim[1]+2;
710       } else {
711         NM = dim[1]+1;
712       }
713       for (i=0; i<local_n0; i++) {
714         for (j=0; j<dim[1]; j++) {
715           tempindx  = i*dim[1] + j;
716           tempindx1 = i*NM + j;
717 
718           indx1[tempindx]=local_0_start*dim[1]+tempindx;
719           indx2[tempindx]=low+tempindx1;
720         }
721       }
722 
723       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
724       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
725 
726       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
727       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
728       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
729       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
730       ierr = ISDestroy(&list1);CHKERRQ(ierr);
731       ierr = ISDestroy(&list2);CHKERRQ(ierr);
732       ierr = PetscFree(indx1);CHKERRQ(ierr);
733       ierr = PetscFree(indx2);CHKERRQ(ierr);
734 #endif
735       break;
736 
737     case 3:
738 #if defined(PETSC_USE_COMPLEX)
739       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
740 
741       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
742       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
743       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
744       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
745       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
746       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
747       ierr = ISDestroy(&list1);CHKERRQ(ierr);
748       ierr = ISDestroy(&list2);CHKERRQ(ierr);
749 #else
750       alloc_local =  fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
751 
752       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
753 
754       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
755       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
756 
757       if (dim[2]%2==0) NM = dim[2]+2;
758       else             NM = dim[2]+1;
759 
760       for (i=0; i<local_n0; i++) {
761         for (j=0; j<dim[1]; j++) {
762           for (k=0; k<dim[2]; k++) {
763             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
764             tempindx1 = i*dim[1]*NM + j*NM + k;
765 
766             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
767             indx2[tempindx]=low+tempindx1;
768           }
769         }
770       }
771 
772       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
773       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
774       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
775       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
776       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
777       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
778       ierr = ISDestroy(&list1);CHKERRQ(ierr);
779       ierr = ISDestroy(&list2);CHKERRQ(ierr);
780       ierr = PetscFree(indx1);CHKERRQ(ierr);
781       ierr = PetscFree(indx2);CHKERRQ(ierr);
782 #endif
783       break;
784 
785     default:
786 #if defined(PETSC_USE_COMPLEX)
787       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
788 
789       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
790       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
791       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
792       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
793       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
794       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
795       ierr = ISDestroy(&list1);CHKERRQ(ierr);
796       ierr = ISDestroy(&list2);CHKERRQ(ierr);
797 #else
798       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
799 
800       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
801 
802       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
803 
804       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
805 
806       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
807 
808       partial_dim = fftw->partial_dim;
809 
810       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
811       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
812 
813       if (dim[ndim-1]%2==0) NM = 2;
814       else                  NM = 1;
815 
816       j = low;
817       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim;i++,k++) {
818         indx1[i] = local_0_start*partial_dim + i;
819         indx2[i] = j;
820         if (k%dim[ndim-1]==0) j+=NM;
821         j++;
822       }
823       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
824       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
825       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
826       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
827       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
828       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
829       ierr = ISDestroy(&list1);CHKERRQ(ierr);
830       ierr = ISDestroy(&list2);CHKERRQ(ierr);
831       ierr = PetscFree(indx1);CHKERRQ(ierr);
832       ierr = PetscFree(indx2);CHKERRQ(ierr);
833 #endif
834       break;
835     }
836   }
837   PetscFunctionReturn(0);
838 }
839 EXTERN_C_END
840 
841 #undef __FUNCT__
842 #define __FUNCT__ "VecScatterFFTWToPetsc"
843 /*@
844    VecScatterFFTWToPetsc - Converts FFTW output to the PETSc vector.
845 
846    Collective on Mat
847 
848     Input Parameters:
849 +   A - FFTW matrix
850 -   x - FFTW vector
851 
852    Output Parameters:
853 .  y - PETSc vector
854 
855    Level: intermediate
856 
857    Note: While doing real transform the FFTW output of backward DFT contains extra zeros at the end of last dimension.
858          VecScatterFFTWToPetsc removes those extra zeros.
859 
860 .seealso: VecScatterPetscToFFTW()
861 @*/
862 PetscErrorCode VecScatterFFTWToPetsc(Mat A,Vec x,Vec y)
863 {
864   PetscErrorCode ierr;
865 
866   PetscFunctionBegin;
867   ierr = PetscTryMethod(A,"VecScatterFFTWToPetsc_C",(Mat,Vec,Vec),(A,x,y));CHKERRQ(ierr);
868   PetscFunctionReturn(0);
869 }
870 
871 EXTERN_C_BEGIN
872 #undef __FUNCT__
873 #define __FUNCT__ "VecScatterFFTWToPetsc_FFTW"
874 PetscErrorCode VecScatterFFTWToPetsc_FFTW(Mat A,Vec x,Vec y)
875 {
876   PetscErrorCode ierr;
877   MPI_Comm       comm;
878   Mat_FFT        *fft  = (Mat_FFT*)A->data;
879   Mat_FFTW       *fftw = (Mat_FFTW*)fft->data;
880   PetscInt       N     = fft->N;
881   PetscInt       ndim  = fft->ndim,*dim=fft->dim;
882   PetscInt       low;
883   PetscInt       rank,size;
884   ptrdiff_t      alloc_local,local_n0,local_0_start;
885   ptrdiff_t      local_n1,local_1_start;
886   VecScatter     vecscat;
887   IS             list1,list2;
888 #if !defined(PETSC_USE_COMPLEX)
889   PetscInt  i,j,k,partial_dim;
890   PetscInt  *indx1, *indx2, tempindx, tempindx1;
891   PetscInt  N1, n1,NM;
892   ptrdiff_t temp;
893 #endif
894 
895   PetscFunctionBegin;
896   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
897   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
898   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
899   ierr = VecGetOwnershipRange(x,&low,NULL);CHKERRQ(ierr);
900 
901   if (size==1) {
902     ierr = ISCreateStride(comm,N,0,1,&list1);CHKERRQ(ierr);
903     ierr = VecScatterCreate(x,list1,y,list1,&vecscat);CHKERRQ(ierr);
904     ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
905     ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
906     ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
907     ierr = ISDestroy(&list1);CHKERRQ(ierr);
908 
909   } else {
910     switch (ndim) {
911     case 1:
912 #if defined(PETSC_USE_COMPLEX)
913       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_BACKWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
914 
915       ierr = ISCreateStride(comm,local_n1,local_1_start,1,&list1);
916       ierr = ISCreateStride(comm,local_n1,low,1,&list2);
917       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
918       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
919       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
920       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
921       ierr = ISDestroy(&list1);CHKERRQ(ierr);
922       ierr = ISDestroy(&list2);CHKERRQ(ierr);
923 #else
924       SETERRQ(comm,PETSC_ERR_SUP,"No support for real parallel 1D FFT");
925 #endif
926       break;
927     case 2:
928 #if defined(PETSC_USE_COMPLEX)
929       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
930 
931       ierr = ISCreateStride(comm,local_n0*dim[1],local_0_start*dim[1],1,&list1);
932       ierr = ISCreateStride(comm,local_n0*dim[1],low,1,&list2);
933       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
934       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
935       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
936       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
937       ierr = ISDestroy(&list1);CHKERRQ(ierr);
938       ierr = ISDestroy(&list2);CHKERRQ(ierr);
939 #else
940       alloc_local =  fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
941 
942       N1 = 2*dim[0]*(dim[1]/2+1); n1 = 2*local_n0*(dim[1]/2+1);
943 
944       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx1);CHKERRQ(ierr);
945       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1],&indx2);CHKERRQ(ierr);
946 
947       if (dim[1]%2==0) NM = dim[1]+2;
948       else             NM = dim[1]+1;
949 
950       for (i=0; i<local_n0; i++) {
951         for (j=0; j<dim[1]; j++) {
952           tempindx = i*dim[1] + j;
953           tempindx1 = i*NM + j;
954 
955           indx1[tempindx]=local_0_start*dim[1]+tempindx;
956           indx2[tempindx]=low+tempindx1;
957         }
958       }
959 
960       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
961       ierr = ISCreateGeneral(comm,local_n0*dim[1],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
962 
963       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
964       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
965       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
966       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
967       ierr = ISDestroy(&list1);CHKERRQ(ierr);
968       ierr = ISDestroy(&list2);CHKERRQ(ierr);
969       ierr = PetscFree(indx1);CHKERRQ(ierr);
970       ierr = PetscFree(indx2);CHKERRQ(ierr);
971 #endif
972       break;
973     case 3:
974 #if defined(PETSC_USE_COMPLEX)
975       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
976 
977       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],local_0_start*dim[1]*dim[2],1,&list1);
978       ierr = ISCreateStride(comm,local_n0*dim[1]*dim[2],low,1,&list2);
979       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
980       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
981       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
982       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
983       ierr = ISDestroy(&list1);CHKERRQ(ierr);
984       ierr = ISDestroy(&list2);CHKERRQ(ierr);
985 #else
986 
987       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
988 
989       N1 = 2*dim[0]*dim[1]*(dim[2]/2+1); n1 = 2*local_n0*dim[1]*(dim[2]/2+1);
990 
991       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx1);CHKERRQ(ierr);
992       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*dim[1]*dim[2],&indx2);CHKERRQ(ierr);
993 
994       if (dim[2]%2==0) NM = dim[2]+2;
995       else             NM = dim[2]+1;
996 
997       for (i=0; i<local_n0; i++) {
998         for (j=0; j<dim[1]; j++) {
999           for (k=0; k<dim[2]; k++) {
1000             tempindx  = i*dim[1]*dim[2] + j*dim[2] + k;
1001             tempindx1 = i*dim[1]*NM + j*NM + k;
1002 
1003             indx1[tempindx]=local_0_start*dim[1]*dim[2]+tempindx;
1004             indx2[tempindx]=low+tempindx1;
1005           }
1006         }
1007       }
1008 
1009       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1010       ierr = ISCreateGeneral(comm,local_n0*dim[1]*dim[2],indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1011 
1012       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1013       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1014       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1015       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1016       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1017       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1018       ierr = PetscFree(indx1);CHKERRQ(ierr);
1019       ierr = PetscFree(indx2);CHKERRQ(ierr);
1020 #endif
1021       break;
1022     default:
1023 #if defined(PETSC_USE_COMPLEX)
1024       alloc_local = fftw_mpi_local_size(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start);
1025 
1026       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),local_0_start*(fftw->partial_dim),1,&list1);
1027       ierr = ISCreateStride(comm,local_n0*(fftw->partial_dim),low,1,&list2);
1028       ierr = VecScatterCreate(x,list1,y,list2,&vecscat);CHKERRQ(ierr);
1029       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1030       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1031       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1032       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1033       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1034 #else
1035       temp = (fftw->dim_fftw)[fftw->ndim_fftw-1];
1036 
1037       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp/2 + 1;
1038 
1039       alloc_local = fftw_mpi_local_size_transposed(fftw->ndim_fftw,fftw->dim_fftw,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1040 
1041       N1 = 2*N*(PetscInt)((fftw->dim_fftw)[fftw->ndim_fftw-1])/((PetscInt) temp);
1042 
1043       (fftw->dim_fftw)[fftw->ndim_fftw-1] = temp;
1044 
1045       partial_dim = fftw->partial_dim;
1046 
1047       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx1);CHKERRQ(ierr);
1048       ierr = PetscMalloc(sizeof(PetscInt)*((PetscInt)local_n0)*partial_dim,&indx2);CHKERRQ(ierr);
1049 
1050       if (dim[ndim-1]%2==0) NM = 2;
1051       else                  NM = 1;
1052 
1053       j = low;
1054       for (i=0,k=1; i<((PetscInt)local_n0)*partial_dim; i++,k++) {
1055         indx1[i] = local_0_start*partial_dim + i;
1056         indx2[i] = j;
1057         if (k%dim[ndim-1]==0) j+=NM;
1058         j++;
1059       }
1060       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx1,PETSC_COPY_VALUES,&list1);CHKERRQ(ierr);
1061       ierr = ISCreateGeneral(comm,local_n0*partial_dim,indx2,PETSC_COPY_VALUES,&list2);CHKERRQ(ierr);
1062 
1063       ierr = VecScatterCreate(x,list2,y,list1,&vecscat);CHKERRQ(ierr);
1064       ierr = VecScatterBegin(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1065       ierr = VecScatterEnd(vecscat,x,y,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1066       ierr = VecScatterDestroy(&vecscat);CHKERRQ(ierr);
1067       ierr = ISDestroy(&list1);CHKERRQ(ierr);
1068       ierr = ISDestroy(&list2);CHKERRQ(ierr);
1069       ierr = PetscFree(indx1);CHKERRQ(ierr);
1070       ierr = PetscFree(indx2);CHKERRQ(ierr);
1071 #endif
1072       break;
1073     }
1074   }
1075   PetscFunctionReturn(0);
1076 }
1077 EXTERN_C_END
1078 
1079 EXTERN_C_BEGIN
1080 #undef __FUNCT__
1081 #define __FUNCT__ "MatCreate_FFTW"
1082 /*
1083     MatCreate_FFTW - Creates a matrix object that provides FFT via the external package FFTW
1084 
1085   Options Database Keys:
1086 + -mat_fftw_plannerflags - set FFTW planner flags
1087 
1088    Level: intermediate
1089 
1090 */
1091 PetscErrorCode MatCreate_FFTW(Mat A)
1092 {
1093   PetscErrorCode ierr;
1094   MPI_Comm       comm;
1095   Mat_FFT        *fft=(Mat_FFT*)A->data;
1096   Mat_FFTW       *fftw;
1097   PetscInt       n         =fft->n,N=fft->N,ndim=fft->ndim,*dim = fft->dim;
1098   const char     *p_flags[]={"FFTW_ESTIMATE","FFTW_MEASURE","FFTW_PATIENT","FFTW_EXHAUSTIVE"};
1099   PetscBool      flg;
1100   PetscInt       p_flag,partial_dim=1,ctr;
1101   PetscMPIInt    size,rank;
1102   ptrdiff_t      *pdim;
1103   ptrdiff_t      local_n1,local_1_start;
1104 #if !defined(PETSC_USE_COMPLEX)
1105   ptrdiff_t temp;
1106   PetscInt  N1,tot_dim;
1107 #else
1108   PetscInt n1;
1109 #endif
1110 
1111   PetscFunctionBegin;
1112   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1113   ierr = MPI_Comm_size(comm, &size);CHKERRQ(ierr);
1114   ierr = MPI_Comm_rank(comm, &rank);CHKERRQ(ierr);
1115 
1116   fftw_mpi_init();
1117   pdim    = (ptrdiff_t*)calloc(ndim,sizeof(ptrdiff_t));
1118   pdim[0] = dim[0];
1119 #if !defined(PETSC_USE_COMPLEX)
1120   tot_dim = 2*dim[0];
1121 #endif
1122   for (ctr=1; ctr<ndim; ctr++) {
1123     partial_dim *= dim[ctr];
1124     pdim[ctr]    = dim[ctr];
1125 #if !defined(PETSC_USE_COMPLEX)
1126     if (ctr==ndim-1) tot_dim *= (dim[ctr]/2+1);
1127     else             tot_dim *= dim[ctr];
1128 #endif
1129   }
1130 
1131   if (size == 1) {
1132 #if defined(PETSC_USE_COMPLEX)
1133     ierr = MatSetSizes(A,N,N,N,N);CHKERRQ(ierr);
1134     n    = N;
1135 #else
1136     ierr = MatSetSizes(A,tot_dim,tot_dim,tot_dim,tot_dim);CHKERRQ(ierr);
1137     n    = tot_dim;
1138 #endif
1139 
1140   } else {
1141     ptrdiff_t alloc_local,local_n0,local_0_start;
1142     switch (ndim) {
1143     case 1:
1144 #if !defined(PETSC_USE_COMPLEX)
1145       SETERRQ(comm,PETSC_ERR_SUP,"FFTW does not support parallel 1D real transform");
1146 #else
1147       alloc_local = fftw_mpi_local_size_1d(dim[0],comm,FFTW_FORWARD,FFTW_ESTIMATE,&local_n0,&local_0_start,&local_n1,&local_1_start);
1148 
1149       n    = (PetscInt)local_n0;
1150       n1   = (PetscInt)local_n1;
1151       ierr = MatSetSizes(A,n1,n,N,N);CHKERRQ(ierr);
1152 #endif
1153       break;
1154     case 2:
1155 #if defined(PETSC_USE_COMPLEX)
1156       alloc_local = fftw_mpi_local_size_2d(dim[0],dim[1],comm,&local_n0,&local_0_start);
1157       /*
1158        PetscSynchronizedPrintf(comm,"[%d] MatCreateSeqFFTW: local_n0, local_0_start %d %d, N %d,dim %d, %d\n",rank,(PetscInt)local_n0*dim[1],(PetscInt)local_0_start,m,dim[0],dim[1]);
1159        PetscSynchronizedFlush(comm);
1160        */
1161       n    = (PetscInt)local_n0*dim[1];
1162       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1163 #else
1164       alloc_local = fftw_mpi_local_size_2d_transposed(dim[0],dim[1]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1165 
1166       n    = 2*(PetscInt)local_n0*(dim[1]/2+1);
1167       ierr = MatSetSizes(A,n,n,2*dim[0]*(dim[1]/2+1),2*dim[0]*(dim[1]/2+1));
1168 #endif
1169       break;
1170     case 3:
1171 #if defined(PETSC_USE_COMPLEX)
1172       alloc_local = fftw_mpi_local_size_3d(dim[0],dim[1],dim[2],comm,&local_n0,&local_0_start);
1173 
1174       n    = (PetscInt)local_n0*dim[1]*dim[2];
1175       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1176 #else
1177       alloc_local = fftw_mpi_local_size_3d_transposed(dim[0],dim[1],dim[2]/2+1,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1178 
1179       n   = 2*(PetscInt)local_n0*dim[1]*(dim[2]/2+1);
1180       ierr = MatSetSizes(A,n,n,2*dim[0]*dim[1]*(dim[2]/2+1),2*dim[0]*dim[1]*(dim[2]/2+1));
1181 #endif
1182       break;
1183     default:
1184 #if defined(PETSC_USE_COMPLEX)
1185       alloc_local = fftw_mpi_local_size(ndim,pdim,comm,&local_n0,&local_0_start);
1186 
1187       n    = (PetscInt)local_n0*partial_dim;
1188       ierr = MatSetSizes(A,n,n,N,N);CHKERRQ(ierr);
1189 #else
1190       temp = pdim[ndim-1];
1191 
1192       pdim[ndim-1] = temp/2 + 1;
1193 
1194       alloc_local = fftw_mpi_local_size_transposed(ndim,pdim,comm,&local_n0,&local_0_start,&local_n1,&local_1_start);
1195 
1196       n  = 2*(PetscInt)local_n0*partial_dim*pdim[ndim-1]/temp;
1197       N1 = 2*N*(PetscInt)pdim[ndim-1]/((PetscInt) temp);
1198 
1199       pdim[ndim-1] = temp;
1200 
1201       ierr = MatSetSizes(A,n,n,N1,N1);CHKERRQ(ierr);
1202 #endif
1203       break;
1204     }
1205   }
1206   ierr      = PetscObjectChangeTypeName((PetscObject)A,MATFFTW);CHKERRQ(ierr);
1207   ierr      = PetscNewLog(A,Mat_FFTW,&fftw);CHKERRQ(ierr);
1208   fft->data = (void*)fftw;
1209 
1210   fft->n            = n;
1211   fftw->ndim_fftw   = (ptrdiff_t)ndim; /* This is dimension of fft */
1212   fftw->partial_dim = partial_dim;
1213 
1214   ierr = PetscMalloc(ndim*sizeof(ptrdiff_t), (ptrdiff_t*)&(fftw->dim_fftw));CHKERRQ(ierr);
1215 
1216   for (ctr=0;ctr<ndim;ctr++) (fftw->dim_fftw)[ctr]=dim[ctr];
1217 
1218   fftw->p_forward  = 0;
1219   fftw->p_backward = 0;
1220   fftw->p_flag     = FFTW_ESTIMATE;
1221 
1222   if (size == 1) {
1223     A->ops->mult          = MatMult_SeqFFTW;
1224     A->ops->multtranspose = MatMultTranspose_SeqFFTW;
1225   } else {
1226     A->ops->mult          = MatMult_MPIFFTW;
1227     A->ops->multtranspose = MatMultTranspose_MPIFFTW;
1228   }
1229   fft->matdestroy = MatDestroy_FFTW;
1230   A->assembled    = PETSC_TRUE;
1231   A->preallocated = PETSC_TRUE;
1232 
1233   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatGetVecsFFTW_C","MatGetVecsFFTW_FFTW",MatGetVecsFFTW_FFTW);CHKERRQ(ierr);
1234   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterPetscToFFTW_C","VecScatterPetscToFFTW_FFTW",VecScatterPetscToFFTW_FFTW);CHKERRQ(ierr);
1235   ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"VecScatterFFTWToPetsc_C","VecScatterFFTWToPetsc_FFTW",VecScatterFFTWToPetsc_FFTW);CHKERRQ(ierr);
1236 
1237   /* get runtime options */
1238   ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"FFTW Options","Mat");CHKERRQ(ierr);
1239   ierr = PetscOptionsEList("-mat_fftw_plannerflags","Planner Flags","None",p_flags,4,p_flags[0],&p_flag,&flg);CHKERRQ(ierr);
1240   if (flg) fftw->p_flag = (unsigned)p_flag;
1241   ierr = PetscOptionsEnd();CHKERRQ(ierr);
1242   PetscFunctionReturn(0);
1243 }
1244 EXTERN_C_END
1245 
1246 
1247 
1248 
1249