diff --git a/libraries/dsp/rTwoSDF/src/vhdl/rTwoSDFPkg.vhd b/libraries/dsp/rTwoSDF/src/vhdl/rTwoSDFPkg.vhd
index 79c1bcff2dd8caddf9c9978b336c9b5b5d71dacf..6b88465f4311e4a8ed57f84dd0c84d9ecf4d7346 100644
--- a/libraries/dsp/rTwoSDF/src/vhdl/rTwoSDFPkg.vhd
+++ b/libraries/dsp/rTwoSDF/src/vhdl/rTwoSDFPkg.vhd
@@ -30,7 +30,7 @@ package rTwoSDFPkg is
     -- generics for rTwoSDFStage
     stage_lat      : natural;  -- = 1
     weight_lat     : natural;  -- = 1
-    mul_lat        : natural;  -- = 3
+    mul_lat        : natural;  -- = 3+1
     -- generics for rTwoBFStage
     bf_lat         : natural;  -- = 1
     -- generics for rTwoBF
@@ -41,7 +41,7 @@ package rTwoSDFPkg is
     sep_lat        : natural;  -- = 1
   end record;
   
-  constant c_fft_pipeline   : t_fft_pipeline := (1, 1, 3, 1, 1, 0, 0, 1);
+  constant c_fft_pipeline   : t_fft_pipeline := (1, 1, 4, 1, 1, 0, 0, 1);
   
 end package rTwoSDFPkg;
 
diff --git a/libraries/dsp/rTwoSDF/src/vhdl/rTwoWMul.vhd b/libraries/dsp/rTwoSDF/src/vhdl/rTwoWMul.vhd
index 67247a2939212e91d504d075ad714e23163d56c7..de0b44de498d7f3c63e161e2be0ace40c08538cf 100644
--- a/libraries/dsp/rTwoSDF/src/vhdl/rTwoWMul.vhd
+++ b/libraries/dsp/rTwoSDF/src/vhdl/rTwoWMul.vhd
@@ -28,7 +28,7 @@ entity rTwoWMul is
   generic (
     g_technology : NATURAL := c_tech_select_default;
     g_stage      : natural := 1;
-    g_lat        : natural := 3
+    g_lat        : natural := 3+1       -- 3 for mult, 1 for round
   );
   port (
     clk       : in  std_logic;
@@ -47,16 +47,32 @@ end entity rTwoWMul;
 
 architecture str of rTwoWMul is
 
-  -- Derive the common_complex_mult g_pipeline_* values from g_lat. The sum must match g_lat.
-  constant c_mult_input_lat   : natural := sel_a_b(g_lat>1, 1, 0);                     -- second priority use DSP pipeline input
-  constant c_mult_product_lat : natural := 0;
-  constant c_mult_adder_lat   : natural := sel_a_b(g_lat>2, 1, 0);                     -- third priority use DSP internal product-sum pipeline
-  constant c_mult_extra_lat   : natural := sel_a_b(g_lat>3, g_lat-3, 0);               -- remaining extra pipelining in logic
-  constant c_mult_output_lat  : natural := sel_a_b(g_lat>0, 1, 0) + c_mult_extra_lat;  -- first priority use DSP pipeline output
+  -- Use multiplier product truncate or signed rounding (= away from zero). On hardware for Fsub in
+  -- Apertif and using the WG at various frequencies at subband or between subbands it appears that
+  -- using truncate or sround does not make a noticeable difference in the SST. Signed rounding is
+  -- preferred to preserve zero DC. NOTE: c_use_truncate = true below currently selects truncate.
+  constant c_use_truncate     : boolean := true; --false;
+  
+  -- Derive the common_complex_mult g_pipeline_* values from g_lat. The sum c_total_lat = g_lat, so that g_lat defines
+  -- the total latency from in_* to out_*.
   
   -- DSP multiplier IP
-  constant c_dsp_mult_lat     : natural:= 3;
+  constant c_dsp_mult_lat     : natural := 3;
 
+  -- Pipeline multiplier product rounding from c_prod_w via c_round_w to c_out_dat_w
+  constant c_round_lat        : natural := sel_a_b(g_lat > c_dsp_mult_lat, 1, 0);  -- allocate 1 pipeline for round
+  constant c_lat              : natural := g_lat - c_round_lat;                    -- allocate remaining pipeline to multiplier
+  
+  constant c_mult_input_lat   : natural := sel_a_b(c_lat>1, 1, 0);                     -- second priority use DSP pipeline input
+  constant c_mult_product_lat : natural := 0;
+  constant c_mult_adder_lat   : natural := sel_a_b(c_lat>2, 1, 0);                     -- third priority use DSP internal product-sum pipeline
+  constant c_mult_extra_lat   : natural := sel_a_b(c_lat>3, c_lat-3, 0);               -- remaining extra pipelining in logic
+  constant c_mult_output_lat  : natural := sel_a_b(c_lat>0, 1, 0) + c_mult_extra_lat;  -- first priority use DSP pipeline output
+  constant c_mult_lat         : natural := c_mult_input_lat + c_mult_product_lat + c_mult_adder_lat + c_mult_output_lat;
+  
+  -- Total input to output latency
+  constant c_total_lat   : natural := c_mult_lat + c_round_lat;
+  
   -- Quantization
   constant c_in_dat_w         : natural:= in_re'length;
   constant c_weight_w         : natural:= weight_re'length;
@@ -74,6 +90,11 @@ architecture str of rTwoWMul is
 
 begin
   
+  -- Total latency check: the derived c_total_lat (= c_mult_lat + c_round_lat) must equal the requested g_lat
+  ASSERT c_total_lat = g_lat
+    REPORT "rTwoWMul: total pipeline error"
+    SEVERITY FAILURE;
+  
   ------------------------------------------------------------------------------
   -- Complex multiplication
   -- . use the common_complex_mult(rtl) for the output stage 1 because then
@@ -81,11 +102,11 @@ begin
   --   weight_re = 1 and weight_im = 0 inputs.
   -- . the IP in common_complex_mult(stratix4) only supports up to 18b wide
   --   inputs.
-  --   . for g_lat = 0,1,2 use the RTL multiplier
-  --   . for g_lat >= 3 default best use the FPGA multiplier IP block.
+  --   . for c_lat = 0,1,2 use the RTL multiplier
+  --   . for c_lat >= 3 default best use the FPGA multiplier IP block.
   ------------------------------------------------------------------------------
 
-  gen_rtl : if g_stage=1 or c_in_dat_w>c_dsp_mult_w or g_lat<c_dsp_mult_lat generate
+  gen_rtl : if g_stage=1 or c_in_dat_w>c_dsp_mult_w or c_lat<c_dsp_mult_lat generate
     u_CmplxMul : entity common_mult_lib.common_complex_mult
     generic map (
       g_technology       => g_technology,
@@ -109,11 +130,11 @@ begin
       in_val    => in_val,
       out_pr    => product_re,
       out_pi    => product_im,
-      out_val   => out_val
+      out_val   => OPEN
     );
   end generate;
   
-  gen_ip : if g_stage>1 and c_in_dat_w<=c_dsp_mult_w and g_lat>=c_dsp_mult_lat generate
+  gen_ip : if g_stage>1 and c_in_dat_w<=c_dsp_mult_w and c_lat>=c_dsp_mult_lat generate
     u_cmplx_mul : entity common_mult_lib.common_complex_mult
     generic map (
       g_technology       => g_technology,
@@ -137,7 +158,7 @@ begin
       in_val    => in_val,
       out_pr    => product_re,
       out_pi    => product_im,
-      out_val   => out_val
+      out_val   => OPEN
     );
   end generate;
 
@@ -145,20 +166,40 @@ begin
   -- Round WMult output
   ------------------------------------------------------------------------------
   
-  -- use truncate    that throws away the c_round_w lower bits as rounding function
-  -- use resize_svec that keeps the c_out_dat_w lower bits to get to the output width
-  round_re <= truncate_and_resize_svec(product_re, c_round_w, c_out_dat_w);
-  round_im <= truncate_and_resize_svec(product_im, c_round_w, c_out_dat_w);
-
-  -- output real and imaginary, switch between input and product
-  out_re <= round_re when out_sel = '1' else in_re_dly;
-  out_im <= round_im when out_sel = '1' else in_im_dly;
+  gen_truncate : if c_use_truncate=true GENERATE  -- quantize the product by truncation
+    -- use truncate    that throws away the c_round_w lower bits as rounding function
+    -- use resize_svec that keeps the c_out_dat_w lower bits to get to the output width
+    gen_comb : if c_round_lat=0 generate  -- no round pipeline stage: combinatorial assignment
+      round_re <= truncate_and_resize_svec(product_re, c_round_w, c_out_dat_w);
+      round_im <= truncate_and_resize_svec(product_im, c_round_w, c_out_dat_w);
+    end generate;
+    gen_reg : if c_round_lat=1 generate  -- one round pipeline stage: assign on rising_edge(clk)
+      round_re <= truncate_and_resize_svec(product_re, c_round_w, c_out_dat_w) when rising_edge(clk);
+      round_im <= truncate_and_resize_svec(product_im, c_round_w, c_out_dat_w) when rising_edge(clk);
+    end generate;
+  end generate;
+    
+  
+  gen_sround : if c_use_truncate=false GENERATE  -- quantize the product by signed rounding
+    -- Use resize_svec(s_round()) instead of truncate_and_resize_svec() to have symmetrical rounding around 0
+    -- Rounding takes logic due to adding 0.5 therefore need to use c_round_lat=1 to achieve timing
+    gen_comb : if c_round_lat=0 generate  -- no round pipeline stage: rejected by the ASSERT below
+      ASSERT false REPORT "rTwoWMul: can probably not achieve timing for sround without pipeline" SEVERITY FAILURE;
+      round_re <= RESIZE_SVEC(s_round(product_re, c_round_w), c_out_dat_w);
+      round_im <= RESIZE_SVEC(s_round(product_im, c_round_w), c_out_dat_w);
+    end generate;
+    gen_reg : if c_round_lat=1 generate  -- one round pipeline stage: assign on rising_edge(clk)
+      round_re <= RESIZE_SVEC(s_round(product_re, c_round_w), c_out_dat_w) when rising_edge(clk);
+      round_im <= RESIZE_SVEC(s_round(product_im, c_round_w), c_out_dat_w) when rising_edge(clk);
+    end generate;
+  end generate;
 
 
   ------------------------------------------------------------------------------
   -- Propagate data and control signals for input/output choice at WMult output
   ------------------------------------------------------------------------------
-  
+
+  -- No need to use rst for data, because initial data value is don't care
   u_re_lat : entity common_lib.common_pipeline
   generic map (
     g_pipeline  => g_lat,
@@ -183,14 +224,33 @@ begin
     out_dat => in_im_dly
   );
 
+  -- Use rst for control to ensure initial low
   u_sel_lat : entity common_lib.common_pipeline_sl
   generic map (
     g_pipeline => g_lat
   )
   port map (
+    rst     => rst,
     clk     => clk,
     in_dat  => in_sel,
     out_dat => out_sel
   );
 
+  u_pipeline_out_val : entity common_lib.common_pipeline_sl  -- drives out_val; the multiplier out_val ports are left OPEN
+  generic map (
+    g_pipeline  => g_lat  -- same g_lat as used for the u_sel_lat control pipeline
+  )
+  port map (
+    rst     => rst,
+    clk     => clk,
+    in_dat  => in_val,
+    out_dat => out_val
+  );
+
+  ------------------------------------------------------------------------------
+  -- Output real and imaginary, switch between input and product
+  ------------------------------------------------------------------------------
+  out_re <= round_re when out_sel = '1' else in_re_dly;  -- out_sel = '1': quantized product, else in_re_dly
+  out_im <= round_im when out_sel = '1' else in_im_dly;  -- out_sel = '1': quantized product, else in_im_dly
+
 end str;
\ No newline at end of file
diff --git a/libraries/dsp/rTwoSDF/tb/vhdl/tb_rTwoSDF.vhd b/libraries/dsp/rTwoSDF/tb/vhdl/tb_rTwoSDF.vhd
index 7e113d74d9c937224097f155c7357cf846f2b092..a03663c51fb0056f341b11fb19774159d0bd5d62 100644
--- a/libraries/dsp/rTwoSDF/tb/vhdl/tb_rTwoSDF.vhd
+++ b/libraries/dsp/rTwoSDF/tb/vhdl/tb_rTwoSDF.vhd
@@ -84,22 +84,7 @@ entity tb_rTwoSDF is
     g_nof_points        : natural  := 1024;
     g_in_dat_w          : natural  := 8;   
     g_out_dat_w         : natural  := 14;   
-    g_guard_w           : natural  := 2;     -- guard bits are used to avoid overflow in single FFT stage.   
-
-    -- Internal pipeline settings for rTwoSDF
-    g_pipeline : t_fft_pipeline := (1, 1, 3, 1, 1, 0, 0, 1)  -- type t_rtwo_sdf_stage_pipeline is record
-                                                             --   -- generics for rTwoSDFStage
-                                                             --   stage_lat      : natural;  -- = 1
-                                                             --   weight_lat     : natural;  -- = 1
-                                                             --   mul_lat        : natural;  -- = 3
-                                                             --   -- generics for rTwoBFStage
-                                                             --   bf_lat         : natural;  -- = 1
-                                                             --   -- generics for rTwoBF
-                                                             --   bf_use_zdly    : natural;  -- = 1
-                                                             --   bf_in_a_zdly   : natural;  -- = 0
-                                                             --   bf_out_d_zdly  : natural;  -- = 0
-                                                             --   sep_lat        : natural;  -- = 1
-                                                             -- end record;
+    g_guard_w           : natural  := 2      -- guard bits are used to avoid overflow in single FFT stage.
   );
 end entity tb_rTwoSDF;
 
@@ -269,9 +254,7 @@ begin
     g_out_dat_w   => g_out_dat_w, 
     g_stage_dat_w => c_stage_dat_w,
     g_guard_w     => g_guard_w,
-    g_nof_points  => g_nof_points,
-    -- generics for rTwoSDFStage
-    g_pipeline    => g_pipeline
+    g_nof_points  => g_nof_points
   )
   port map(
     clk       => clk,
diff --git a/libraries/dsp/rTwoSDF/tb/vhdl/tb_tb_rTwoSDF.vhd b/libraries/dsp/rTwoSDF/tb/vhdl/tb_tb_rTwoSDF.vhd
index 8734636a2db1dc11d0432cc332f454d14a267b8b..1efbecad26bbfb1f8a876bded1c7b4e24b09bf7c 100644
--- a/libraries/dsp/rTwoSDF/tb/vhdl/tb_tb_rTwoSDF.vhd
+++ b/libraries/dsp/rTwoSDF/tb/vhdl/tb_tb_rTwoSDF.vhd
@@ -52,26 +52,11 @@ begin
 --  g_nof_points        : natural  := 1024;
 --  g_in_dat_w          : natural  := 8;   
 --  g_out_dat_w         : natural  := 14;   
---  g_guard_w           : natural  := 2;     -- guard bits are used to avoid overflow in single FFT stage.   
---
---  -- Internal pipeline settings for rTwoSDF
---  g_pipeline : t_fft_pipeline := (1, 1, 3, 1, 1, 0, 0, 1)  -- type t_rtwo_sdf_stage_pipeline is record
---                                                           --   -- generics for rTwoSDFStage
---                                                           --   stage_lat      : natural;  -- = 1
---                                                           --   weight_lat     : natural;  -- = 1
---                                                           --   mul_lat        : natural;  -- = 3
---                                                           --   -- generics for rTwoBFStage
---                                                           --   bf_lat         : natural;  -- = 1
---                                                           --   -- generics for rTwoBF
---                                                           --   bf_use_zdly    : natural;  -- = 1
---                                                           --   bf_in_a_zdly   : natural;  -- = 0
---                                                           --   bf_out_d_zdly  : natural;  -- = 0
---                                                           --   sep_lat        : natural;  -- = 1
---                                                           -- end record;
+--  g_guard_w           : natural  := 2      -- guard bits are used to avoid overflow in single FFT stage.   
 
-  --u_act_impulse_16p_16i_16o         : entity work.tb_rTwoSDF generic map (false, 1,  true,   16, 16, 16, 2, (1, 1, 3, 1, 1, 0, 0, 1));
-  u_act_noise_1024p_8i_14o          : entity work.tb_rTwoSDF generic map (true,  1,  true, 1024,  8, 14, 2, (1, 1, 3, 1, 1, 0, 0, 1));
-  u_rnd_noise_1024p_8i_14o          : entity work.tb_rTwoSDF generic map (true,  0,  true, 1024,  8, 14, 2, (1, 1, 3, 1, 1, 0, 0, 1));
-  u_rnd_noise_1024p_8i_14o_flipped  : entity work.tb_rTwoSDF generic map (true,  0, false, 1024,  8, 14, 2, (1, 1, 3, 1, 1, 0, 0, 1));
+  --u_act_impulse_16p_16i_16o         : entity work.tb_rTwoSDF generic map (false, 1,  true,   16, 16, 16, 2);
+  u_act_noise_1024p_8i_14o          : entity work.tb_rTwoSDF generic map (true,  1,  true, 1024,  8, 14, 2);
+  u_rnd_noise_1024p_8i_14o          : entity work.tb_rTwoSDF generic map (true,  0,  true, 1024,  8, 14, 2);
+  u_rnd_noise_1024p_8i_14o_flipped  : entity work.tb_rTwoSDF generic map (true,  0, false, 1024,  8, 14, 2);
   
 end tb;