1 2 /* 3 PetscKernelAXPY - X = X + alpha * Y 4 5 Input Parameters: 6 + X, Y - arrays 7 . alpha - scalar 8 - n - length of arrays 9 10 Also PetscKernelAXPY2(), PetscKernelAXPY3(), PetscKernelAXPY4() 11 12 */ 13 14 #ifndef PetscKernelAXPY 15 16 #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY) 17 #if defined(PETSC_HAVE_FORTRAN_CAPS) 18 #define fortranmaxpy4_ FORTRANMAXPY4 19 #define fortranmaxpy3_ FORTRANMAXPY3 20 #define fortranmaxpy2_ FORTRANMAXPY2 21 #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE) 22 #define fortranmaxpy4_ fortranmaxpy4 23 #define fortranmaxpy3_ fortranmaxpy3 24 #define fortranmaxpy2_ fortranmaxpy2 25 #endif 26 PETSC_EXTERN void fortranmaxpy4_(void*,void*,void*,void*,void*,const void*,const void*,const void*,const void*,PetscInt*); 27 PETSC_EXTERN void fortranmaxpy3_(void*,void*,void*,void*,const void*,const void*,const void*,PetscInt*); 28 PETSC_EXTERN void fortranmaxpy2_(void*,void*,void*,const void*,const void*,PetscInt*); 29 #endif 30 #include <petscblaslapack.h> 31 32 #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY) 33 #define PetscKernelAXPY(U,a1,p1,n) {PetscBLASInt one=1; PetscBLASInt nn = (PetscBLASInt) n; PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&nn,&a1,p1,&one,U,&one));} 34 #define PetscKernelAXPY2(U,a1,a2,p1,p2,n) {fortranmaxpy2_(U,&a1,&a2,p1,p2,&n);} 35 #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n) {fortranmaxpy3_(U,&a1,&a2,&a3,p1,p2,p3,&n);} 36 #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){fortranmaxpy4_(U,&a1,&a2,&a3,&a4,p1,p2,p3,p4,&n);} 37 38 #elif defined(PETSC_USE_UNROLL_KERNELS) 39 40 #define PetscKernelAXPY(U,Alpha,P,n) {\ 41 switch (n & 0x3) {\ 42 case 3: *U++ += Alpha * *P++;\ 43 case 2: *U++ += Alpha * *P++;\ 44 case 1: *U++ += Alpha * *P++;\ 45 n -= 4;case 0: break;}while (n>0) {U[0] += Alpha * P[0];U[1] += Alpha * P[1]; U[2] += Alpha * P[2]; U[3] += Alpha * P[3];U += 4; P += 4; n -= 4;}} 46 #define PetscKernelAXPY2(U,a1,a2,p1,p2,n) {\ 47 switch (n & 0x3) {\ 48 case 3: *U++ += a1 * *p1++ + a2 * *p2++;\ 49 case 2: *U++ += a1 * *p1++ + a2 * *p2++;\ 50 case 1: *U++ += a1 * *p1++ + a2 * *p2++;\ 51 n -= 4;case 0: break;}\ 52 while (n>0) {U[0]+=a1*p1[0]+a2*p2[0];U[1]+=a1*p1[1]+a2*p2[1]; U[2]+=a1*p1[2]+a2*p2[2];U[3]+=a1*p1[3]+a2*p2[3];U+=4;p1+=4;p2+=4;n -= 4;}} 53 #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n) {\ 54 switch (n & 0x3) {\ 55 case 3: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\ 56 case 2: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\ 57 case 1: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\ 58 n -= 4;case 0:break;}\ 59 while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0];U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1];U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2];U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3];U+=4;p1+=4;p2+=4;p3+=4;n-=4;}} 60 #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {\ 61 switch (n & 0x3) {\ 62 case 3: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\ 63 case 2: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\ 64 case 1: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\ 65 n -= 4;case 0:break;}\ 66 while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0]+a4*p4[0];U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1]+a4*p4[1];U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2]+a4*p4[2];U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3]+a4*p4[3];U+=4;p1+=4;p2+=4;p3+=4;p4+=4;n-=4;}} 67 68 #elif defined(PETSC_USE_WHILE_KERNELS) 69 70 #define PetscKernelAXPY(U,a1,p1,n) {while (n--) *U++ += a1 * *p1++;} 71 #define PetscKernelAXPY2(U,a1,a2,p1,p2,n) {while (n--) *U++ += a1 * *p1++ + a2 * *p2++;} 72 #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n) {while (n--) *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;} 73 #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {while (n--) *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;} 74 75 #elif defined(PETSC_USE_BLAS_KERNELS) 76 77 #define PetscKernelAXPY(U,a1,p1,n) {PetscBLASInt one=1; PetscBLASInt nn = (PetscBLASInt) n; PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&nn,&a1,p1,&one,U,&one));} 78 #define PetscKernelAXPY2(U,a1,a2,p1,p2,n){PetscKernelAXPY(U,a1,p1,n); PetscKernelAXPY(U,a2,p2,n);} 79 #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n){PetscKernelAXPY2(U,a1,a2,p1,p2,n); PetscKernelAXPY(U,a3,p3,n);} 80 #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscKernelAXPY2(U,a1,a2,p1,p2,n); PetscKernelAXPY2(U,a3,a4,p3,p4,n);} 81 82 #elif defined(PETSC_USE_FOR_KERNELS) 83 84 #define PetscKernelAXPY(U,a1,p1,n) {PetscInt __i;PetscScalar __s1,__s2; \ 85 for (__i=0;__i<n-1;__i+=2){\ 86 __s1=a1*p1[__i];__s2=a1*p1[__i+1]; __s1+=U[__i];__s2+=U[__i+1];U[__i]=__s1;U[__i+1]=__s2;}\ 87 if (n & 0x1) U[__i] += a1 * p1[__i];} 88 #define PetscKernelAXPY2(U,a1,a2,p1,p2,n) {PetscInt __i; for (__i=0;__i<n;__i++) U[__i] += a1 * p1[__i] + a2 * p2[__i];} 89 #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i; for (__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i];} 90 #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;for (__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i]+a4*p4[__i];} 91 92 #else 93 94 #define PetscKernelAXPY(U,a1,p1,n) {PetscInt __i;PetscScalar _a1=a1; for (__i=0;__i<n;__i++)U[__i]+=_a1 * p1[__i];} 95 #define PetscKernelAXPY2(U,a1,a2,p1,p2,n) {PetscInt __i; for (__i=0;__i<n;__i++)U[__i] += a1 * p1[__i] + a2 * p2[__i];} 96 #define PetscKernelAXPY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i; for (__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i];} 97 #define PetscKernelAXPY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;for (__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i]+a4*p4[__i];} 98 99 #endif 100 101 #endif 102