From 6b3a3db1c9f076b622653a35e50d47afe5297e24 Mon Sep 17 00:00:00 2001
From: Eric Kooistra <>
Date: Fri, 18 Mar 2022 15:04:46 +0100
Subject: [PATCH] Round outside the separate function, in the output quantizer,
 to avoid the less accurate extra rounding of the 1 LSbit growth in separate.

 libraries/dsp/fft/src/vhdl/fft_r2_par.vhd     | 140 +++++------------
 libraries/dsp/fft/src/vhdl/fft_r2_pipe.vhd    |  32 ++--
 libraries/dsp/fft/src/vhdl/fft_r2_wide.vhd    |  32 ++--
 .../fft/src/vhdl/fft_reorder_sepa_pipe.vhd    |  12 +-
 libraries/dsp/fft/src/vhdl/fft_sepa.vhd       | 143 ++++++------------
 libraries/dsp/fft/src/vhdl/fft_sepa_wide.vhd  |  68 +++++----
 6 files changed, 163 insertions(+), 264 deletions(-)

diff --git a/libraries/dsp/fft/src/vhdl/fft_r2_par.vhd b/libraries/dsp/fft/src/vhdl/fft_r2_par.vhd
index c9089c8de0..02bf10f3cd 100644
--- a/libraries/dsp/fft/src/vhdl/fft_r2_par.vhd
+++ b/libraries/dsp/fft/src/vhdl/fft_r2_par.vhd
@@ -125,30 +125,37 @@ architecture str of fft_r2_par is
   constant c_pipeline_add_sub    : natural := 1;
   constant c_pipeline_remove_lsb : natural := 1;
-  constant c_sepa_round          : boolean := true;  -- must be true, because separate should round the 1 bit growth
   constant c_nof_stages         : natural := ceil_log2(g_fft.nof_points);  
   constant c_nof_bf_per_stage   : natural := g_fft.nof_points/2;  
   constant c_in_scale_w_tester  : integer := g_fft.stage_dat_w - g_fft.in_dat_w - sel_a_b(g_fft.guard_enable, g_fft.guard_w, 0);
   constant c_in_scale_w         : natural := sel_a_b(c_in_scale_w_tester > 0, c_in_scale_w_tester, 0);  -- Only scale when in_dat_w is not too big. 
   constant c_out_scale_w        : integer := g_fft.stage_dat_w - g_fft.out_dat_w - g_fft.out_gain_w;    -- Estimate number of LSBs to throw away when > 0 or insert when < 0
+  constant c_sepa_growth_w      : natural := sel_a_b(g_fft.use_separate, 1, 0);  -- add one bit for add sub growth in separate
+  constant c_raw_dat_w          : natural := g_fft.stage_dat_w + c_sepa_growth_w;
   type   t_stage_dat_arr   is array (integer range <>)     of std_logic_vector(g_fft.stage_dat_w-1 downto 0);
-  type   t_stage_sum_arr   is array (integer range <>)     of std_logic_vector(g_fft.stage_dat_w   downto 0);
+  type   t_stage_raw_arr   is array (integer range <>)     of std_logic_vector(c_raw_dat_w-1 downto 0);
   type   t_data_arr2       is array(c_nof_stages downto 0) of t_stage_dat_arr(g_fft.nof_points-1 downto 0);
   type   t_val_arr         is array(c_nof_stages downto 0) of std_logic_vector( g_fft.nof_points-1 downto 0);
   signal data_re          : t_data_arr2;
   signal data_im          : t_data_arr2;
   signal data_val         : t_val_arr;
   signal int_re_arr       : t_stage_dat_arr(g_fft.nof_points-1 downto 0);
   signal int_im_arr       : t_stage_dat_arr(g_fft.nof_points-1 downto 0);
-  signal fft_re_arr       : t_stage_dat_arr(g_fft.nof_points-1 downto 0);
-  signal fft_im_arr       : t_stage_dat_arr(g_fft.nof_points-1 downto 0);
-  signal add_arr          : t_stage_sum_arr(g_fft.nof_points-1 downto 0);
-  signal sub_arr          : t_stage_sum_arr(g_fft.nof_points-1 downto 0);
   signal int_val          : std_logic;
+  signal int_a_dc         : std_logic_vector(g_fft.stage_dat_w-1 downto 0);
+  signal int_b_dc         : std_logic_vector(g_fft.stage_dat_w-1 downto 0);
+  signal add_arr          : t_stage_raw_arr(g_fft.nof_points-1 downto 0);
+  signal sub_arr          : t_stage_raw_arr(g_fft.nof_points-1 downto 0);
+  signal fft_re_arr       : t_stage_raw_arr(g_fft.nof_points-1 downto 0);
+  signal fft_im_arr       : t_stage_raw_arr(g_fft.nof_points-1 downto 0);
   signal fft_val          : std_logic;
@@ -235,7 +242,7 @@ begin
         g_pipeline_input  => 0, 
         g_pipeline_output => c_pipeline_add_sub, 
         g_in_dat_w        => g_fft.stage_dat_w,    
-        g_out_dat_w       => g_fft.stage_dat_w+1
+        g_out_dat_w       => c_raw_dat_w
       port map (
         clk     => clk,
@@ -251,7 +258,7 @@ begin
         g_pipeline_input  => 0, 
         g_pipeline_output => c_pipeline_add_sub, 
         g_in_dat_w        => g_fft.stage_dat_w,
-        g_out_dat_w       => g_fft.stage_dat_w+1
+        g_out_dat_w       => c_raw_dat_w
       port map (
         clk     => clk,
@@ -267,7 +274,7 @@ begin
         g_pipeline_input  => 0, 
         g_pipeline_output => c_pipeline_add_sub, 
         g_in_dat_w        => g_fft.stage_dat_w,   
-        g_out_dat_w       => g_fft.stage_dat_w+1
+        g_out_dat_w       => c_raw_dat_w
       port map (
         clk     => clk,
@@ -283,7 +290,7 @@ begin
         g_pipeline_input  => 0, 
         g_pipeline_output => c_pipeline_add_sub, 
         g_in_dat_w        => g_fft.stage_dat_w,   
-        g_out_dat_w       => g_fft.stage_dat_w+1
+        g_out_dat_w       => c_raw_dat_w
       port map (
         clk     => clk,
@@ -292,84 +299,14 @@ begin
         result  => sub_arr(2*I+1)
-      gen_sepa_truncate : IF c_sepa_round=false GENERATE
-        -- truncate the one LSbit
-        fft_re_arr(2*I  ) <= add_arr(2*I  )(g_fft.stage_dat_w DOWNTO 1);  -- A real
-        fft_re_arr(2*I+1) <= add_arr(2*I+1)(g_fft.stage_dat_w DOWNTO 1);  -- B real
-        fft_im_arr(2*I  ) <= sub_arr(2*I  )(g_fft.stage_dat_w DOWNTO 1);  -- A imag
-        fft_im_arr(2*I+1) <= sub_arr(2*I+1)(g_fft.stage_dat_w DOWNTO 1);  -- B imag
-      end generate;
-      gen_sepa_round : IF c_sepa_round=true GENERATE
-        -- round the one LSbit
-        round_re_a : ENTITY common_lib.common_round
-        GENERIC MAP (
-          g_representation  => "SIGNED",  -- SIGNED (round +-0.5 away from zero to +- infinity) or UNSIGNED rounding (round 0.5 up to + inifinity)
-          g_round           => TRUE,      -- when TRUE round the input, else truncate the input
-          g_round_clip      => FALSE,     -- when TRUE clip rounded input >= +max to avoid wrapping to output -min (signed) or 0 (unsigned)
-          g_pipeline_input  => 0,         -- >= 0
-          g_pipeline_output => 0,         -- >= 0, use g_pipeline_input=0 and g_pipeline_output=0 for combinatorial output
-          g_in_dat_w        => g_fft.stage_dat_w+1,
-          g_out_dat_w       => g_fft.stage_dat_w
-        )
-        PORT MAP (
-          clk        => clk,
-          in_dat     => add_arr(2*I),
-          out_dat    => fft_re_arr(2*I)
-        );
-        round_re_b : ENTITY common_lib.common_round
-        GENERIC MAP (
-          g_representation  => "SIGNED",  -- SIGNED (round +-0.5 away from zero to +- infinity) or UNSIGNED rounding (round 0.5 up to + inifinity)
-          g_round           => TRUE,      -- when TRUE round the input, else truncate the input
-          g_round_clip      => FALSE,     -- when TRUE clip rounded input >= +max to avoid wrapping to output -min (signed) or 0 (unsigned)
-          g_pipeline_input  => 0,         -- >= 0
-          g_pipeline_output => 0,         -- >= 0, use g_pipeline_input=0 and g_pipeline_output=0 for combinatorial output
-          g_in_dat_w        => g_fft.stage_dat_w+1,
-          g_out_dat_w       => g_fft.stage_dat_w
-        )
-        PORT MAP (
-          clk        => clk,
-          in_dat     => add_arr(2*I+1),
-          out_dat    => fft_re_arr(2*I+1)
-        );
-        round_im_a : ENTITY common_lib.common_round
-        GENERIC MAP (
-          g_representation  => "SIGNED",  -- SIGNED (round +-0.5 away from zero to +- infinity) or UNSIGNED rounding (round 0.5 up to + inifinity)
-          g_round           => TRUE,      -- when TRUE round the input, else truncate the input
-          g_round_clip      => FALSE,     -- when TRUE clip rounded input >= +max to avoid wrapping to output -min (signed) or 0 (unsigned)
-          g_pipeline_input  => 0,         -- >= 0
-          g_pipeline_output => 0,         -- >= 0, use g_pipeline_input=0 and g_pipeline_output=0 for combinatorial output
-          g_in_dat_w        => g_fft.stage_dat_w+1,
-          g_out_dat_w       => g_fft.stage_dat_w
-        )
-        PORT MAP (
-          clk        => clk,
-          in_dat     => sub_arr(2*I),
-          out_dat    => fft_im_arr(2*I)
-        );
-        round_im_b : ENTITY common_lib.common_round
-        GENERIC MAP (
-          g_representation  => "SIGNED",  -- SIGNED (round +-0.5 away from zero to +- infinity) or UNSIGNED rounding (round 0.5 up to + inifinity)
-          g_round           => TRUE,      -- when TRUE round the input, else truncate the input
-          g_round_clip      => FALSE,     -- when TRUE clip rounded input >= +max to avoid wrapping to output -min (signed) or 0 (unsigned)
-          g_pipeline_input  => 0,         -- >= 0
-          g_pipeline_output => 0,         -- >= 0, use g_pipeline_input=0 and g_pipeline_output=0 for combinatorial output
-          g_in_dat_w        => g_fft.stage_dat_w+1,
-          g_out_dat_w       => g_fft.stage_dat_w
-        )
-        PORT MAP (
-          clk        => clk,
-          in_dat     => sub_arr(2*I+1),
-          out_dat    => fft_im_arr(2*I+1)
-        );
-      end generate;
+      fft_re_arr(2*I  ) <= add_arr(2*I  )(c_raw_dat_w-1 DOWNTO 0);  -- A real
+      fft_re_arr(2*I+1) <= add_arr(2*I+1)(c_raw_dat_w-1 DOWNTO 0);  -- B real
+      fft_im_arr(2*I  ) <= sub_arr(2*I  )(c_raw_dat_w-1 DOWNTO 0);  -- A imag
+      fft_im_arr(2*I+1) <= sub_arr(2*I+1)(c_raw_dat_w-1 DOWNTO 0);  -- B imag
     end generate;
-    -- Generate bin 0 directly
+    -- Generate bin 0 = DC directly
     -- Index N=g_fft.nof_points wraps to index 0:
     -- . fft_re_arr(0) = (int_re_arr(0) + int_re_arr(N)) / 2 = int_re_arr(0)
@@ -379,28 +316,34 @@ begin
     u_pipeline_a_re_0 : entity common_lib.common_pipeline
     generic map (
-      g_pipeline  => c_pipeline_add_sub,
-      g_in_dat_w  => g_fft.stage_dat_w,
-      g_out_dat_w => g_fft.stage_dat_w
+      g_representation => "SIGNED",
+      g_pipeline       => c_pipeline_add_sub,
+      g_in_dat_w       => g_fft.stage_dat_w,
+      g_out_dat_w      => g_fft.stage_dat_w
     port map (
       clk     => clk,
       in_dat  => int_re_arr(0),
-      out_dat => fft_re_arr(0)
+      out_dat => int_a_dc
     u_pipeline_b_re_0 : entity common_lib.common_pipeline
     generic map (
-      g_pipeline  => c_pipeline_add_sub,
-      g_in_dat_w  => g_fft.stage_dat_w,
-      g_out_dat_w => g_fft.stage_dat_w
+      g_representation => "SIGNED",
+      g_pipeline       => c_pipeline_add_sub,
+      g_in_dat_w       => g_fft.stage_dat_w,
+      g_out_dat_w      => g_fft.stage_dat_w
     port map (
       clk     => clk,
       in_dat  => int_im_arr(0),
-      out_dat => fft_re_arr(1)
+      out_dat => int_b_dc
+    -- The real outputs of A(0) and B(0) are shifted left by one bit (* 2) to match the gain of the separate add
+    fft_re_arr(0) <= int_a_dc & '0';
+    fft_re_arr(1) <= int_b_dc & '0';
     -- The imaginary outputs of A(0) and B(0) are always zero in case two real inputs are provided
     fft_im_arr(0) <= (others=>'0');
     fft_im_arr(1) <= (others=>'0');
@@ -421,6 +364,7 @@ begin
   no_separate : if g_fft.use_separate=false generate 
     assign_outputs : for I in 0 to g_fft.nof_points-1 generate
+      -- c_raw_dat_w = g_fft.stage_dat_w, because g_fft.use_separate=false
       fft_re_arr(I) <= int_re_arr(I);    
       fft_im_arr(I) <= int_im_arr(I);  
     end generate;
@@ -434,14 +378,14 @@ begin
     u_requantize_re : entity common_lib.common_requantize
     generic map (
       g_representation      => "SIGNED",      
-      g_lsb_w               => c_out_scale_w,      
+      g_lsb_w               => c_out_scale_w + c_sepa_growth_w,
       g_lsb_round           => TRUE,           
       g_lsb_round_clip      => FALSE,      
       g_msb_clip            => FALSE,            
       g_msb_clip_symmetric  => FALSE,  
       g_pipeline_remove_lsb => c_pipeline_remove_lsb, 
       g_pipeline_remove_msb => 0, 
-      g_in_dat_w            => g_fft.stage_dat_w,            
+      g_in_dat_w            => c_raw_dat_w,
       g_out_dat_w           => g_fft.out_dat_w
     port map (
@@ -454,14 +398,14 @@ begin
     u_requantize_im : entity common_lib.common_requantize
     generic map (
       g_representation      => "SIGNED",      
-      g_lsb_w               => c_out_scale_w,
+      g_lsb_w               => c_out_scale_w + c_sepa_growth_w,
       g_lsb_round           => TRUE,           
       g_lsb_round_clip      => FALSE,      
       g_msb_clip            => FALSE,            
       g_msb_clip_symmetric  => FALSE,  
       g_pipeline_remove_lsb => c_pipeline_remove_lsb, 
       g_pipeline_remove_msb => 0, 
-      g_in_dat_w            => g_fft.stage_dat_w,            
+      g_in_dat_w            => c_raw_dat_w,
       g_out_dat_w           => g_fft.out_dat_w
     port map (
diff --git a/libraries/dsp/fft/src/vhdl/fft_r2_pipe.vhd b/libraries/dsp/fft/src/vhdl/fft_r2_pipe.vhd
index 00c2007bd8..994f865331 100644
--- a/libraries/dsp/fft/src/vhdl/fft_r2_pipe.vhd
+++ b/libraries/dsp/fft/src/vhdl/fft_r2_pipe.vhd
@@ -100,7 +100,8 @@ architecture str of fft_r2_pipe is
   constant c_in_scale_w         : natural := g_fft.stage_dat_w - g_fft.in_dat_w - sel_a_b(g_fft.guard_enable, g_fft.guard_w, 0);              
   constant c_out_scale_w        : integer := g_fft.stage_dat_w - g_fft.out_dat_w - g_fft.out_gain_w;  -- Estimate number of LSBs to throw throw away when > 0 or insert when < 0
   constant c_raw_dat_extra_w    : natural := sel_a_b(g_fft.use_separate, g_sepa_extra_w, 0);
-  constant c_raw_dat_w          : natural := g_fft.stage_dat_w + c_raw_dat_extra_w;
+  constant c_sepa_growth_w      : natural := sel_a_b(g_fft.use_separate, 1, 0);  -- add one bit for add sub growth in separate
+  constant c_raw_dat_w          : natural := g_fft.stage_dat_w + c_sepa_growth_w;
   -- number the stage instances from c_nof_stages:1
   -- . the data input for the first stage has index c_nof_stages
@@ -117,12 +118,10 @@ architecture str of fft_r2_pipe is
   signal data_re      : t_data_arr;
   signal data_im      : t_data_arr;
-  signal last_re      : std_logic_vector(c_raw_dat_w-1 downto 0);
-  signal last_im      : std_logic_vector(c_raw_dat_w-1 downto 0);
   signal data_val     : std_logic_vector(c_nof_stages downto 0):= (others=>'0');
+  signal in_cplx      : std_logic_vector(c_nof_complex*g_fft.stage_dat_w-1 downto 0);
   signal out_cplx     : std_logic_vector(c_nof_complex*c_raw_dat_w-1 downto 0);
-  signal in_cplx      : std_logic_vector(c_nof_complex*c_raw_dat_w-1 downto 0);
   signal raw_out_re   : std_logic_vector(c_raw_dat_w-1 downto 0);
   signal raw_out_im   : std_logic_vector(c_raw_dat_w-1 downto 0);
   signal raw_out_val  : std_logic;
@@ -209,16 +208,16 @@ begin
     in_re     => data_re(1),
     in_im     => data_im(1),
     in_val    => data_val(1),
-    out_re    => last_re,  -- = data_re(0), but may instead have c_raw_dat_w bits
-    out_im    => last_im,  -- = data_im(0), but may instead have c_raw_dat_w bits
+    out_re    => data_re(0),
+    out_im    => data_im(0),
     out_val   => data_val(0)
   -- Optional output reorder and separation
-  gen_reorder_and_separate : if(g_fft.use_separate or g_fft.use_reorder) generate 
-    in_cplx <= last_im & last_re;
+  gen_reorder_and_separate : if g_fft.use_separate or g_fft.use_reorder generate
+    in_cplx <= data_im(0) & data_re(0);
     u_reorder_sep : entity work.fft_reorder_sepa_pipe
     generic map (
@@ -232,20 +231,23 @@ begin
     port map (
       clk     => clk,
       rst     => rst,
-      in_dat  => in_cplx,
+      in_dat  => in_cplx,      -- c_nof_complex * g_fft.stage_dat_w
       in_val  => data_val(0),
-      out_dat => out_cplx,
+      out_dat => out_cplx,     -- c_nof_complex * c_raw_dat_w
       out_val => raw_out_val
+    -- c_raw_dat_w = g_fft.stage_dat_w     when g_fft.use_separate = false
+    -- c_raw_dat_w = g_fft.stage_dat_w + 1 when g_fft.use_separate = true
     raw_out_re <= out_cplx(  c_raw_dat_w-1 downto 0);
     raw_out_im <= out_cplx(2*c_raw_dat_w-1 downto c_raw_dat_w);
   end generate;
-  no_reorder_no_seperate : if(g_fft.use_separate=false and g_fft.use_reorder=false) generate
-    raw_out_re  <= last_re;
-    raw_out_im  <= last_im;
+  no_reorder_no_seperate : if g_fft.use_separate=false and g_fft.use_reorder=false generate
+    -- c_raw_dat_w = g_fft.stage_dat_w because g_fft.use_separate = false
+    raw_out_re  <= data_re(0);
+    raw_out_im  <= data_im(0);
     raw_out_val <= data_val(0);
   end generate;  
@@ -255,7 +257,7 @@ begin
   u_requantize_re : entity common_lib.common_requantize
   generic map (
     g_representation      => "SIGNED",      
-    g_lsb_w               => c_out_scale_w + c_raw_dat_extra_w,
+    g_lsb_w               => c_out_scale_w + c_sepa_growth_w,
     g_lsb_round           => TRUE,           
     g_lsb_round_clip      => FALSE,      
     g_msb_clip            => FALSE,            
@@ -275,7 +277,7 @@ begin
   u_requantize_im : entity common_lib.common_requantize
   generic map (
     g_representation      => "SIGNED",      
-    g_lsb_w               => c_out_scale_w + c_raw_dat_extra_w,
+    g_lsb_w               => c_out_scale_w + c_sepa_growth_w,
     g_lsb_round           => TRUE,           
     g_lsb_round_clip      => FALSE,      
     g_msb_clip            => FALSE,            
diff --git a/libraries/dsp/fft/src/vhdl/fft_r2_wide.vhd b/libraries/dsp/fft/src/vhdl/fft_r2_wide.vhd
index da55a674b0..2490f5c6ab 100644
--- a/libraries/dsp/fft/src/vhdl/fft_r2_wide.vhd
+++ b/libraries/dsp/fft/src/vhdl/fft_r2_wide.vhd
@@ -153,18 +153,25 @@ architecture rtl of fft_r2_wide is
   constant c_out_scale_w      : integer := c_fft_r2_par.out_dat_w - g_fft.out_dat_w - g_fft.out_gain_w;  -- Estimate number of LSBs to throw away when > 0 or insert when < 0
+  constant c_sepa_growth_w    : natural := sel_a_b(g_fft.use_separate, 1, 0);  -- add one bit for add sub growth in separate
+  constant c_raw_dat_w        : natural := g_fft.stage_dat_w + c_sepa_growth_w;
+  -- g_fft.wb_factor = 1
+  signal fft_pipe_out_re      : std_logic_vector(g_fft.out_dat_w-1 downto 0);
+  signal fft_pipe_out_im      : std_logic_vector(g_fft.out_dat_w-1 downto 0);
+  -- g_fft.wb_factor > 1 and < g_fft.nof_points
   signal in_fft_pipe_re_arr   : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);
   signal in_fft_pipe_im_arr   : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);
   signal out_fft_pipe_re_arr  : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);
   signal out_fft_pipe_im_arr  : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);
+  signal out_fft_pipe_val     : std_logic_vector(g_fft.wb_factor-1 downto 0);
+  signal in_fft_par           : std_logic;  -- = out_fft_pipe_val(0)
   signal in_fft_par_re_arr    : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);
   signal in_fft_par_im_arr    : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);
-  signal fft_pipe_out_re      : std_logic_vector(g_fft.out_dat_w-1 downto 0);
-  signal fft_pipe_out_im      : std_logic_vector(g_fft.out_dat_w-1 downto 0);
   signal fft_out_re_arr       : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);
   signal fft_out_im_arr       : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);
   signal fft_out_val          : std_logic;                                   
@@ -173,11 +180,6 @@ architecture rtl of fft_r2_wide is
   signal sep_out_im_arr       : t_fft_slv_arr(g_fft.wb_factor-1 downto 0);   
   signal sep_out_val          : std_logic;                                   
-  signal int_val              : std_logic_vector(g_fft.wb_factor-1 downto 0);
-  signal out_cplx             : std_logic_vector(c_nof_complex*g_fft.stage_dat_w-1 downto 0);
-  signal in_cplx              : std_logic_vector(c_nof_complex*g_fft.stage_dat_w-1 downto 0);
   -- Default to fft_r2_pipe when g_fft.wb_factor=1
@@ -252,7 +254,7 @@ begin
         in_val    => in_val,
         out_re    => out_fft_pipe_re_arr(I)(c_fft_r2_pipe_arr(I).out_dat_w-1 downto 0),
         out_im    => out_fft_pipe_im_arr(I)(c_fft_r2_pipe_arr(I).out_dat_w-1 downto 0),
-        out_val   => int_val(I)
+        out_val   => out_fft_pipe_val(I)
     end generate;       
@@ -261,6 +263,8 @@ begin
+    in_fft_par <= out_fft_pipe_val(0);
     -- Create input for parallel FFT
     gen_inputs_for_par : for I in g_fft.wb_factor-1 downto 0 generate
       in_fft_par_re_arr(I) <= resize_fft_svec(out_fft_pipe_re_arr(I)(c_fft_r2_pipe_arr(I).out_dat_w-1 downto 0));
@@ -279,7 +283,7 @@ begin
       rst        => rst,
       in_re_arr  => in_fft_par_re_arr,
       in_im_arr  => in_fft_par_im_arr,
-      in_val     => int_val(0),
+      in_val     => in_fft_par,
       out_re_arr => fft_out_re_arr,
       out_im_arr => fft_out_im_arr,
       out_val    => fft_out_val
@@ -320,14 +324,14 @@ begin
       u_requantize_output_re : entity common_lib.common_requantize
       generic map (
         g_representation      => "SIGNED",      
-        g_lsb_w               => c_out_scale_w,               
+        g_lsb_w               => c_out_scale_w + c_sepa_growth_w,
         g_lsb_round           => TRUE,           
         g_lsb_round_clip      => FALSE,      
         g_msb_clip            => FALSE,            
         g_msb_clip_symmetric  => FALSE,  
         g_pipeline_remove_lsb => c_pipeline_remove_lsb, 
         g_pipeline_remove_msb => 0, 
-        g_in_dat_w            => g_fft.stage_dat_w,            
+        g_in_dat_w            => c_raw_dat_w,
         g_out_dat_w           => g_fft.out_dat_w
       port map (
@@ -340,14 +344,14 @@ begin
       u_requantize_output_im : entity common_lib.common_requantize
       generic map (
         g_representation      => "SIGNED",      
-        g_lsb_w               => c_out_scale_w,               
+        g_lsb_w               => c_out_scale_w + c_sepa_growth_w,
         g_lsb_round           => TRUE,           
         g_lsb_round_clip      => FALSE,      
         g_msb_clip            => FALSE,            
         g_msb_clip_symmetric  => FALSE,  
         g_pipeline_remove_lsb => c_pipeline_remove_lsb, 
         g_pipeline_remove_msb => 0, 
-        g_in_dat_w            => g_fft.stage_dat_w,            
+        g_in_dat_w            => c_raw_dat_w,
         g_out_dat_w           => g_fft.out_dat_w
       port map (
diff --git a/libraries/dsp/fft/src/vhdl/fft_reorder_sepa_pipe.vhd b/libraries/dsp/fft/src/vhdl/fft_reorder_sepa_pipe.vhd
index 89d056fb82..b363745caf 100644
--- a/libraries/dsp/fft/src/vhdl/fft_reorder_sepa_pipe.vhd
+++ b/libraries/dsp/fft/src/vhdl/fft_reorder_sepa_pipe.vhd
@@ -51,9 +51,9 @@ entity fft_reorder_sepa_pipe is
   port (
     clk     : in  std_logic;
     rst     : in  std_logic;
-    in_dat  : in  std_logic_vector;
+    in_dat  : in  std_logic_vector;  -- c_dat_w
     in_val  : in  std_logic;
-    out_dat : out std_logic_vector;
+    out_dat : out std_logic_vector;  -- c_dat_w when g_separate = false, else c_dat_w + 2
     out_val : out std_logic
 end entity fft_reorder_sepa_pipe;
@@ -323,9 +323,9 @@ begin
     port map (
       clk     => clk,
       rst     => rst,
-      in_dat  => out_dat_i, 
+      in_dat  => out_dat_i,  -- c_dat_w
       in_val  => out_val_i,
-      out_dat => out_dat,
+      out_dat => out_dat,    -- c_dat_w + 2
       out_val => out_val
   end generate;                             
@@ -335,8 +335,8 @@ begin
   -- the output signals are directly driven. 
   gen_no_separate : if g_separate=false generate
     rd_adr  <= TO_UVEC(r.count_up, c_adr_tot_w);
-    out_dat <= out_dat_i;
-    out_val <= out_val_i;                   
+    out_dat <= out_dat_i;  -- c_dat_w
+    out_val <= out_val_i;
   end generate;                                 
 end rtl;
diff --git a/libraries/dsp/fft/src/vhdl/fft_sepa.vhd b/libraries/dsp/fft/src/vhdl/fft_sepa.vhd
index 5bf2423a85..65da081cd7 100644
--- a/libraries/dsp/fft/src/vhdl/fft_sepa.vhd
+++ b/libraries/dsp/fft/src/vhdl/fft_sepa.vhd
@@ -41,17 +41,9 @@
 --              B.imag(m) = (X.real(N-m) - X.real(m))/2
 -- Remarks:
--- . The add and sub output of the separate have 1 bit growth that needs to be
---   rounded. Simply skipping 1 LSbit is not suitable, because it yields
---   asymmetry around 0 and thus a DC offset. For example for N = 3-bit data:
---              x =  -4 -3 -2 -1  0  1  2  3
---     round(x/2) =  -2 -2 -1 -1  0  1  1  2  = common_round for signed
---     floor(x/2) =  -2 -2 -1 -1  0  0  1  1  = truncation
---   The most negative value can be ignored:
---              x : mean(-3 -2 -1  0  1  2  3) = 0
---   . round(x/2) : mean(-2 -1 -1  0  1  1  2) = 0
---   . floor(x/2) : mean(-2 -1 -1  0  0  1  1) = -2/8 = -0.25 = -2^(N-1)/2 / 2^N
---   So the DC offset due to truncation is -0.25 LSbit, independent of N.
+-- . The A, B outputs are scaled by factor 2 due to separate add and sub.
+--   Therefore in_dat re, im have c_in_data_w bits and out_dat re, im have
+--   c_out_data_w = c_in_data_w + 1 bits, to avoid overflow.
 library IEEE, common_lib;
 use IEEE.std_logic_1164.ALL;
@@ -62,40 +54,40 @@ entity fft_sepa is
   port (
     clk     : in  std_logic;
     rst     : in  std_logic;
-    in_dat  : in  std_logic_vector;
+    in_dat  : in  std_logic_vector;   -- c_nof_complex * c_in_data_w
     in_val  : in  std_logic;
-    out_dat : out std_logic_vector;
+    out_dat : out std_logic_vector;   -- c_nof_complex * c_out_data_w = c_nof_complex * (c_in_data_w + 1)
     out_val : out std_logic
 end entity fft_sepa;
 architecture rtl of fft_sepa is                
-  constant c_sepa_round  : boolean := true;  -- must be true, because separate should round the 1 bit growth
-  constant c_data_w   : natural := in_dat'length/c_nof_complex;  
-  constant c_c_data_w : natural := c_nof_complex*c_data_w;
-  constant c_pipeline : natural := 3;
+  constant c_in_data_w     : natural := in_dat'length / c_nof_complex;
+  constant c_in_complex_w  : natural := c_nof_complex * c_in_data_w;
+  constant c_out_data_w    : natural := c_in_data_w  + 1;
+  constant c_out_complex_w : natural := c_nof_complex * c_out_data_w;
+  constant c_pipeline      : natural := 3;
-  type reg_type is record
-    switch    : std_logic;                                 -- Register used to toggle between A & B definitionn
-    val_dly   : std_logic_vector(c_pipeline-1 downto 0);   -- Register that delays the incoming valid signal
-    xn_m_reg  : std_logic_vector(c_c_data_w-1 downto 0);   -- Register to hold the X(N-m) value for one cycle
-    xm_reg    : std_logic_vector(c_c_data_w-1 downto 0);   -- Register to hold the X(m) value for one cycle
-    add_reg_a : std_logic_vector(c_data_w-1   downto 0);   -- Input register A for the adder
-    add_reg_b : std_logic_vector(c_data_w-1   downto 0);   -- Input register B for the adder
-    sub_reg_a : std_logic_vector(c_data_w-1   downto 0);   -- Input register A for the subtractor
-    sub_reg_b : std_logic_vector(c_data_w-1   downto 0);   -- Input register B for the subtractor
-    out_dat   : std_logic_vector(c_c_data_w-1 downto 0);   -- Registered output value
-    out_val   : std_logic;                                 -- Registered data valid signal  
+  type t_reg is record
+    switch    : std_logic;                                     -- Register used to toggle between A & B definition
+    val_dly   : std_logic_vector(c_pipeline-1 downto 0);       -- Register that delays the incoming valid signal
+    xn_m_reg  : std_logic_vector(c_in_complex_w-1 downto 0);   -- Register to hold the X(N-m) value for one cycle
+    xm_reg    : std_logic_vector(c_in_complex_w-1 downto 0);   -- Register to hold the X(m) value for one cycle
+    add_reg_a : std_logic_vector(c_in_data_w-1 downto 0);      -- Input register A for the adder
+    add_reg_b : std_logic_vector(c_in_data_w-1 downto 0);      -- Input register B for the adder
+    sub_reg_a : std_logic_vector(c_in_data_w-1 downto 0);      -- Input register A for the subtractor
+    sub_reg_b : std_logic_vector(c_in_data_w-1 downto 0);      -- Input register B for the subtractor
+    out_dat   : std_logic_vector(c_out_complex_w-1 downto 0);  -- Registered output value
+    out_val   : std_logic;                                     -- Registered data valid signal
   end record;
+  constant c_reg_init : t_reg := ('0', (others=>'0'), (others=>'0'), (others=>'0'), (others=>'0'), (others=>'0'), (others=>'0'), (others=>'0'), (others=>'0'), '0');
-  signal r, rin     : reg_type; 
-  signal sub_result : std_logic_vector(c_data_w downto 0); -- Result of the subtractor   
-  signal add_result : std_logic_vector(c_data_w downto 0); -- Result of the adder   
-  signal sub_result_q : std_logic_vector(c_data_w-1 downto 0); -- Requantized result of the subtractor   
-  signal add_result_q : std_logic_vector(c_data_w-1 downto 0); -- Requantized result of the adder
+  signal r          : t_reg := c_reg_init;
+  signal rin        : t_reg;
+  signal sub_result : std_logic_vector(c_out_data_w-1 downto 0); -- Result of the subtractor
+  signal add_result : std_logic_vector(c_out_data_w-1 downto 0); -- Result of the adder
@@ -108,8 +100,8 @@ begin
     g_representation  => "SIGNED",
     g_pipeline_input  => 0, 
     g_pipeline_output => 1, 
-    g_in_dat_w        => c_data_w,
-    g_out_dat_w       => c_data_w + 1
+    g_in_dat_w        => c_in_data_w,
+    g_out_dat_w       => c_out_data_w   -- = c_in_data_w + 1
   port map (
     clk     => clk,
@@ -124,8 +116,8 @@ begin
     g_representation  => "SIGNED",
     g_pipeline_input  => 0, 
     g_pipeline_output => 1, 
-    g_in_dat_w        => c_data_w,
-    g_out_dat_w       => c_data_w + 1
+    g_in_dat_w        => c_in_data_w,
+    g_out_dat_w       => c_out_data_w   -- = c_in_data_w + 1
   port map (
     clk     => clk,
@@ -134,52 +126,11 @@ begin
     result  => sub_result
-  gen_sepa_truncate : IF c_sepa_round=FALSE GENERATE
-    -- truncate the one LSbit
-    add_result_q <= add_result(c_data_w downto 1);
-    sub_result_q <= sub_result(c_data_w downto 1);
-  end generate;
-  gen_sepa_round : IF c_sepa_round=TRUE GENERATE
-    -- round the one LSbit
-    round_add : ENTITY common_lib.common_round
-      g_representation  => "SIGNED",  -- SIGNED (round +-0.5 away from zero to +- infinity) or UNSIGNED rounding (round 0.5 up to + inifinity)
-      g_round           => TRUE,      -- when TRUE round the input, else truncate the input
-      g_round_clip      => FALSE,     -- when TRUE clip rounded input >= +max to avoid wrapping to output -min (signed) or 0 (unsigned)
-      g_pipeline_input  => 0,         -- >= 0
-      g_pipeline_output => 0,         -- >= 0, use g_pipeline_input=0 and g_pipeline_output=0 for combinatorial output
-      g_in_dat_w        => c_data_w+1,
-      g_out_dat_w       => c_data_w
-    )
-    PORT MAP (
-      clk        => clk,
-      in_dat     => add_result,
-      out_dat    => add_result_q
-    );
-    round_sub : ENTITY common_lib.common_round
-      g_representation  => "SIGNED",  -- SIGNED (round +-0.5 away from zero to +- infinity) or UNSIGNED rounding (round 0.5 up to + inifinity)
-      g_round           => TRUE,      -- when TRUE round the input, else truncate the input
-      g_round_clip      => FALSE,     -- when TRUE clip rounded input >= +max to avoid wrapping to output -min (signed) or 0 (unsigned)
-      g_pipeline_input  => 0,         -- >= 0
-      g_pipeline_output => 0,         -- >= 0, use g_pipeline_input=0 and g_pipeline_output=0 for combinatorial output
-      g_in_dat_w        => c_data_w+1,
-      g_out_dat_w       => c_data_w
-    )
-    PORT MAP (
-      clk        => clk,
-      in_dat     => sub_result,
-      out_dat    => sub_result_q
-    );
-  end generate;
-  comb : process(r, rst, in_val, in_dat, add_result_q, sub_result_q)
-    variable v : reg_type;
+  comb : process(r, rst, in_val, in_dat, add_result, sub_result)
+    variable v : t_reg;
     v := r; 
@@ -188,7 +139,7 @@ begin
     v.val_dly(0) := in_val;
     -- Composition of the output registers:
-    v.out_dat := sub_result_q & add_result_q;
+    v.out_dat := sub_result & add_result;
     v.out_val := r.val_dly(c_pipeline-1);
     -- Compose the inputs for the adder and subtractor
@@ -196,16 +147,16 @@ begin
     if in_val = '1' or r.val_dly(0) = '1' then
       if r.switch = '0' then 
         v.xm_reg    := in_dat;
-        v.add_reg_a := r.xm_reg(c_c_data_w-1   downto c_data_w);  -- Xm   imag
-        v.add_reg_b := r.xn_m_reg(c_c_data_w-1 downto c_data_w);  -- Xn-m imag
-        v.sub_reg_a := r.xn_m_reg(c_data_w-1   downto 0);         -- Xn-m real
-        v.sub_reg_b := r.xm_reg(c_data_w-1     downto 0);         -- Xm   real
+        v.add_reg_a := r.xm_reg(c_in_complex_w-1   downto c_in_data_w);  -- Xm   imag
+        v.add_reg_b := r.xn_m_reg(c_in_complex_w-1 downto c_in_data_w);  -- Xn-m imag
+        v.sub_reg_a := r.xn_m_reg(c_in_data_w-1    downto 0);            -- Xn-m real
+        v.sub_reg_b := r.xm_reg(c_in_data_w-1      downto 0);            -- Xm   real
         v.xn_m_reg  := in_dat;
-        v.add_reg_a := r.xm_reg(c_data_w-1   downto 0);           -- Xm   real 
-        v.add_reg_b := in_dat(c_data_w-1     downto 0);           -- Xn-m real
-        v.sub_reg_a := r.xm_reg(c_c_data_w-1 downto c_data_w);    -- Xm   imag
-        v.sub_reg_b := in_dat(c_c_data_w-1   downto c_data_w);    -- Xn-m imag
+        v.add_reg_a := r.xm_reg(c_in_data_w-1    downto 0);              -- Xm   real
+        v.add_reg_b := in_dat(c_in_data_w-1      downto 0);              -- Xn-m real
+        v.sub_reg_a := r.xm_reg(c_in_complex_w-1 downto c_in_data_w);    -- Xm   imag
+        v.sub_reg_b := in_dat(c_in_complex_w-1   downto c_in_data_w);    -- Xn-m imag
       end if;
     end if;
@@ -213,16 +164,10 @@ begin
       v.switch := not r.switch;
     end if;
-    if(rst = '1') then
+    if rst = '1' then
+      -- Only need to reset the control signals
       v.switch    := '0';
       v.val_dly   := (others => '0');
-      v.xn_m_reg  := (others => '0');
-      v.xm_reg    := (others => '0');
-      v.add_reg_a := (others => '0');
-      v.add_reg_b := (others => '0');
-      v.sub_reg_a := (others => '0');
-      v.sub_reg_b := (others => '0');
-      v.out_dat   := (others => '0');
       v.out_val   := '0';
     end if;
diff --git a/libraries/dsp/fft/src/vhdl/fft_sepa_wide.vhd b/libraries/dsp/fft/src/vhdl/fft_sepa_wide.vhd
index a950206d5e..0af4993aee 100644
--- a/libraries/dsp/fft/src/vhdl/fft_sepa_wide.vhd
+++ b/libraries/dsp/fft/src/vhdl/fft_sepa_wide.vhd
@@ -67,11 +67,17 @@ architecture rtl of fft_sepa_wide is
   constant c_page_size   : natural := g_fft.nof_points/g_fft.wb_factor;    -- Size of the memories
   constant c_nof_pages   : natural := 2;                                   -- The number of pages in each ram. 
-  constant c_dat_w       : natural := c_nof_complex*g_fft.stage_dat_w;     -- Data width for the internal vectors where real and imag are combined. 
+  constant c_in_w        : natural := g_fft.stage_dat_w;
+  constant c_dat_w       : natural := c_nof_complex*c_in_w;                -- Data width for the internal vectors where real and imag are combined.
   constant c_adr_w       : natural := ceil_log2(c_page_size);              -- Address width of the rams
   constant c_nof_streams : natural := 2;                                   -- Number of inputstreams for the zip units
-  type   t_dat_arr       is array(integer range <> ) of std_logic_vector(c_dat_w-1 downto 0); 
+  constant c_sepa_growth_w      : natural := sel_a_b(g_fft.use_separate, 1, 0);  -- add one bit for add sub growth in separate
+  constant c_out_w              : natural := c_in_w + c_sepa_growth_w;
+  constant c_raw_dat_w          : natural := c_nof_complex*c_out_w;  -- = c_dat_w or c_dat_w + 2
+  type   t_dat_arr       is array(integer range <> ) of std_logic_vector(c_dat_w-1 downto 0);
+  type   t_raw_dat_arr   is array(integer range <> ) of std_logic_vector(c_raw_dat_w-1 downto 0);
   type   t_rd_adr_arr    is array(integer range <> ) of std_logic_vector(c_adr_w-1 downto 0);
   type   t_zip_in_matrix is array(integer range <> ) of t_slv_64_arr(1 downto 0);             -- Every Zip unit has two inputs. 
@@ -85,24 +91,27 @@ architecture rtl of fft_sepa_wide is
   signal zip_in_matrix   : t_zip_in_matrix(g_fft.wb_factor-1 downto 0);  -- Matrix that contains the inputs for zip units
   signal zip_in_val      : std_logic_vector(g_fft.wb_factor-1 downto 0); -- Vector that holds the data input valids for the zip units
-  signal zip_out_dat_arr : t_dat_arr(g_fft.wb_factor-1 downto 0);        -- Array that holds the outputs of all zip units. 
+  signal zip_out_dat_arr : t_dat_arr(g_fft.wb_factor-1 downto 0);        -- Array that holds the outputs of all zip units.
   signal zip_out_val     : std_logic_vector(g_fft.wb_factor-1 downto 0); -- Vector that holds the output valids of the zip units
-  signal sep_out_dat_arr : t_dat_arr(g_fft.wb_factor-1 downto 0);        -- Array that holds the outputs of the separation blocks
+  signal sep_out_dat_arr : t_raw_dat_arr(g_fft.wb_factor-1 downto 0);    -- Array that holds the outputs of the separation blocks
   signal sep_out_val_vec : std_logic_vector(g_fft.wb_factor-1 downto 0); -- Vector containing the datavalids from the separation blocks
-  signal out_dat_arr     : t_dat_arr(g_fft.wb_factor-1 downto 0);        -- Array that holds the ouput values, where real and imag are concatenated 
+  signal out_dat_arr     : t_raw_dat_arr(g_fft.wb_factor-1 downto 0);    -- Array that holds the output values, where real and imag are concatenated
-  type state_type is (s_idle, s_read); 
-  type reg_type   is record
+  type t_state is (s_idle, s_read);
+  type t_reg   is record
     switch      : std_logic;   -- Toggle register used for separate functionalilty
     count_up    : natural range 0 to c_page_size; -- An upwards counter for read addressing
     count_down  : natural range 0 to c_page_size; -- A downwards counter for read addressing
     val_odd     : std_logic;   -- Register that drives the in_valid of the odd zip units
     val_even    : std_logic;   -- Register that drives the in_valid of the even zip units
-    state       : state_type;  -- The state machine. 
+    state       : t_state;     -- The state machine.
   end record;
-  signal r, rin : reg_type;   
+  constant c_reg_init : t_reg := ('0', 0, 0, '0', '0', s_idle);
+  signal r          : t_reg := c_reg_init;
+  signal rin        : t_reg;
@@ -111,7 +120,7 @@ begin
   -- Prepare the data for the dual paged memory. Real and imaginary part are concatenated into one vector. 
   gen_prep_write_data : for I in 0 to g_fft.wb_factor-1 generate 
-    wr_dat(I) <= in_im_arr(I)(g_fft.stage_dat_w-1 downto 0) & in_re_arr(I)(g_fft.stage_dat_w-1 downto 0);
+    wr_dat(I) <= in_im_arr(I)(c_in_w-1 downto 0) & in_re_arr(I)(c_in_w-1 downto 0);
   end generate;
   -- Prepare the write control signals for the memories. 
@@ -204,9 +213,9 @@ begin
     port map (
       clk     => clk,
       rst     => rst,
-      in_dat  => zip_out_dat_arr(I), 
+      in_dat  => zip_out_dat_arr(I),  -- c_dat_w
       in_val  => zip_out_val(I),
-      out_dat => sep_out_dat_arr(I),
+      out_dat => sep_out_dat_arr(I),  -- c_dat_w + 2
       out_val => sep_out_val_vec(I)
   end generate; 
@@ -218,13 +227,13 @@ begin
   -- the fellow toggle signals. It also controls the starting and stopping 
   -- of the data stream. 
   comb : process(r, rst, next_page)
-    variable v : reg_type;
+    variable v : t_reg;
     v := r; 
     case r.state is
-	    when s_idle =>      
+      when s_idle =>
         v.switch     := '0';
         v.val_odd    := '0';
         v.val_even   := '0';
@@ -234,7 +243,7 @@ begin
           v.state    := s_read; 
         end if;
-	    when s_read =>    
+      when s_read =>
         if(r.switch = '0') then                -- Toggle the switch register from 0 to 1
           v.switch   := '1';
         end if; 
@@ -255,22 +264,17 @@ begin
         v.val_odd  := r.switch;                -- Assignment of the odd and even markers
         v.val_even := not(r.switch);
-	    when others =>
-	  	  v.state := s_idle;
+      when others =>
+        v.state := s_idle;
-	  end case;
+    end case;
-    if(rst = '1') then 
-      v.switch     := '0';
-      v.count_up   := 0;
-      v.count_down := 0;
-      v.val_odd    := '0';
-      v.val_even   := '0';
-      v.state      := s_idle;
+    if rst = '1' then
+      v := c_reg_init;
     end if;
     rin <= v;  
   end process comb;
   regs : process(clk)
@@ -287,8 +291,8 @@ begin
     u_output_pipeline_align : entity common_lib.common_pipeline
     generic map (
       g_pipeline  => c_pipeline_output + 1,                       -- Pipeline + one stage for allignment
-      g_in_dat_w  => c_dat_w,
-      g_out_dat_w => c_dat_w
+      g_in_dat_w  => c_raw_dat_w,
+      g_out_dat_w => c_raw_dat_w
     port map (
       clk     => clk,
@@ -299,8 +303,8 @@ begin
     u_output_pipeline : entity common_lib.common_pipeline
     generic map (
       g_pipeline  => c_pipeline_output,                           -- Only pipeline stage
-      g_in_dat_w  => c_dat_w,
-      g_out_dat_w => c_dat_w
+      g_in_dat_w  => c_raw_dat_w,
+      g_out_dat_w => c_raw_dat_w
     port map (
       clk     => clk,
@@ -321,8 +325,8 @@ begin
   -- Split the concatenated array into a real and imaginary array for the output
   gen_output_arrays : for I in g_fft.wb_factor-1 downto 0 generate
-    out_re_arr(I) <= resize_fft_svec(out_dat_arr(I)(              g_fft.stage_dat_w-1 downto                 0));
-    out_im_arr(I) <= resize_fft_svec(out_dat_arr(I)(c_nof_complex*g_fft.stage_dat_w-1 downto g_fft.stage_dat_w));
+    out_re_arr(I) <= resize_fft_svec(out_dat_arr(I)(              c_out_w-1 downto       0));
+    out_im_arr(I) <= resize_fft_svec(out_dat_arr(I)(c_nof_complex*c_out_w-1 downto c_out_w));
   end generate; 
 end rtl;