#include #include #include #include "fft.h" static int Powerof2(int v, int *m, int *twopm); static int FFT(int dir,int m, int nn, FLOAT *r, FLOAT *i); // given v, return m=log2(v) and tpowpm = 2^m static int Powerof2(int v, int *m, int *twopm) { int nn = 1; int mm=0; if (v<=0) return 0; while(nnwidth; int height = c->height; int i,j,o; int m,twopm; static FLOAT *dwn_r=NULL; static FLOAT *dwn_i=NULL; static int across=0,down=0; if (!Powerof2(width,&m,&twopm) || twopm != width) return(FALSE); for (j=o=0;jreal+o,c->imag+o); } if (!Powerof2(height,&m,&twopm) || twopm != height) return(FALSE); if (dwn_r==NULL || dwn_i == NULL || down != height) { if (dwn_r) VFree(dwn_r); if (dwn_i) VFree(dwn_i); dwn_r = VMalloc(width * height * S_FLOAT); dwn_i = VMalloc(width * height * S_FLOAT); down = height; } transpose_matrix(c->real,dwn_r,width,height); transpose_matrix(c->imag,dwn_i,width,height); i=width; width=height; height=i; // Make sure the FFT_REAL flag is cleared since it's no longer right flags &= ~FFT_REAL; for (j=o=0;jreal,width,height); transpose_matrix(dwn_i,c->imag,width,height); return(TRUE); } /***************************************************************************************** * * Bit Reverse Table * * A collection of 1D tables (one for each linear dimension we are interested in) * that map X[n] -> X[m] where m is the bit-reversed equivalent of n * * One table is required for each unique dimension size. Generate tables for * sizes 1 to MAX_P2. Assume the largest size we will ever want is 64k so we can * use 16 bit indexes. */ typedef unsigned short SHORT; static SHORT *BRT[MAX_P2+1]; static int __done_precalculate_brt = 0; /* * bit reverse the bottom "b" bits of an unsigned 16 bit word * * 4 3 2 1 0 4 3 2 1 0 * ie b=5 : [b4,b3,b2,b1,b0] => [b0,b1,b2,b3,b4] */ static inline unsigned int __bit_reverse(SHORT n,int b) { SHORT m = 0; SHORT top; int nn=n; if (b==0) return n; top = 1 << (b-1); while(top) { if (n&1) m |= top; top>>=1; n>>=1; } //printf("%d: reversed %d -> %d\n",b,nn,m); return m; } static void __precalculate_brt() { int m; int N; if (__done_precalculate_brt) return; N=1; for(m=0; m<=MAX_P2; ++m, N<<=1) { int i; SHORT *ptr = (SHORT *) VMalloc(N * sizeof(SHORT *)); for(i=0; i>1; nn=n; if (l>=2) { for(j=0; j>1; nn=n; if (l>=2) { for(j=0; j x(k) e = forward transform N / n=0..N-1 --- k=0 Formula: reverse N-1 --- \ j k 2 pi n / N X(n) = > x(k) e = forward transform / n=0..N-1 --- k=0 Assumptions: - The smallest size handled is 4 elements, due to the optimisation that rolls the first 2 passes together. */ static int FFT(int flags,int m,int nn,FLOAT *real, FLOAT *imag) { long i,i1,j,k,i2,l,l1,l2; int dir; SHORT *brt; FLOAT *R,*I,*RI,*RJ,*II,*IJ,*I2,*J2; FLOAT *U1,*U2; if (m>MAX_P2) { printf("FFT: %d: Maximum power of 2 exceeded\n",m); exit(1); } if (m<2) { printf("FFT: Minimum allowed size is 4 elements\n"); exit(0); } // Check if we need to init our tables for twiddle values and bit reversal if (! __done_precalculate_fft_twiddle) __precalculate_fft_twiddle(); if (! __done_precalculate_brt) __precalculate_brt(); if (flags&FFT_FORWARD) { // Forward direction U1 = U1_F; U2 = U2_F; dir=1; } else if (flags&FFT_REVERSE) { // Reverse direction U1 = U1_R; U2 = U2_R; dir=-1; } else { printf("FFT: Must have one of FFT_FORWARD or FFT_REVERSE\n"); exit(0); } /************************************************************************ * Do the bit reversal via lookup table BRT[] * * reverse the real[] and imag[] arrays separately to improve * cache locality */ if (! (flags & FFT_NO_BRT)) { // bit-reverse real[] array R = real; brt = BRT[m]; for(i=0; i>1); RI += l1; RJ += l1; II += l1; IJ += l1; } U1 += l1; U2 += l1; //for(i=0; i>=2; while(nn-- > 0) { VFLOAT(real) /= nv.v; VFLOAT(imag) /= nv.v; real+=4; imag+=4; } } return(TRUE); } /* Force data to be 16-byte aligned in case we want to use sse vectors */ COMPLEX * create_complex(int w, int h) { COMPLEX *c = (COMPLEX *) VMalloc(sizeof(COMPLEX)); c->width = w; c->height = h; c->real = (FLOAT *) VMalloc(S_FLOAT * w * h); c->imag = (FLOAT *) VMalloc(S_FLOAT * w * h); return c; } void destroy_complex(COMPLEX *c) { VFree(c->real); VFree(c->imag); VFree(c); } /* * allocate a block of memory with malloc() and align it to the natural size of our vector * type (f4vector). * * We do this by allocating sizeof(f4vector) extra bytes and then moving the start pointer * forward by enough bytes to place on the correct boundary. We store an unsigned char * value at offset (-1) indicating how far we have moved this pointer so we can reset the * pointer before calling free(). * * Note if the value returned by malloc() was already aligned then we waste the most memory, * by moving our pointer forward by sizeof(f4vector). * We also allocate an extra 16 bytes over and above the 16 needed for alignment in case * the caller will be using 16-byte vector operations on non-vector data, making sure * there's room at the end for the last vector op. */ static int __v_align = sizeof(f4vector); static int __done_align_check = 0; static inline void __do_align_check() { if (! __done_align_check) { // vector size must be a power of two for this to work int i=1; while(i && i != __v_align) i<<=1; if (i!=__v_align) { printf("VMalloc: Vector size %d (sizeof(f4vector)) non power of two\n",__v_align); exit(1); } __done_align_check = 1; } } void * VMalloc(int n) { unsigned int i; unsigned char *buf; unsigned char *ptr,ch; __do_align_check(); buf = (unsigned char *) malloc(n + __v_align + 16); if (buf == NULL) { printf("VMalloc: out of memory requesting %d bytes\n",n); exit(1); } // set ptr to next alignment boundary inside buf ptr = (unsigned char *) (((unsigned int)buf+__v_align) & ~(__v_align-1)); // how far did we move? ch = (unsigned int)(ptr - buf); // Store the alignment offset in the preceding byte *(ptr-1) = ch; return ptr; } /* * Free memory allocated with VMalloc(). * * The unsigned byte at location *(ptr-1) gives the offset back to the address * provided by malloc. */ void VFree(void *buf) { unsigned char *ptr = (unsigned char *)buf; unsigned int ch = *(ptr-1); int i,loff; ptr -= ch; free(ptr); } inline void _vector_zero_F(FLOAT *dst, int count) { f4vector z; LOADVC(z,0); count /= V_ELEMENTS; while(count--) { VFLOAT(dst) = z.v; dst += V_ELEMENTS; } } inline void _vector_copy_F2F(FLOAT *src, FLOAT *dst, int count) { while(count) { VFLOAT(dst) = VFLOAT(src); count -= V_ELEMENTS; src += V_ELEMENTS; dst += V_ELEMENTS; } } inline void _vector_multiply_Ff(FLOAT *data, float f, int count) { f4vector z; LOADVC(z,f); count /= V_ELEMENTS; while(count--) { VFLOAT(data) *= z.v; data += V_ELEMENTS; } }