diff --git a/applications/apertif/designs/apertif_unb1_correlator/tb/vhdl/tb_node_apertif_unb1_correlator_processing_output.vhd b/applications/apertif/designs/apertif_unb1_correlator/tb/vhdl/tb_node_apertif_unb1_correlator_processing_output.vhd
index 83ed3cce2078cab8fa019befa754244b1a0583db..045218773f9601de6fca91212d598c491a79b285 100644
--- a/applications/apertif/designs/apertif_unb1_correlator/tb/vhdl/tb_node_apertif_unb1_correlator_processing_output.vhd
+++ b/applications/apertif/designs/apertif_unb1_correlator/tb/vhdl/tb_node_apertif_unb1_correlator_processing_output.vhd
@@ -73,6 +73,44 @@
 --   With no inter channel output delay, channels are output back-to-back, creating a short full valid burst of 300 visibilities
 --   per channel for 64 channels per beamlet. This yields nof_complex * cor_out_dat_w * f_clk = 2 * 32 * 200M = 12.8 Gbps in burst.
 --
+-- * Set c_inter_channel_delay and g_bg_duty_cycle_gap in simulation with 1 PN and 1 GbE link:
+--   In the tb with c_nof_tp=2*2=4 there are c_nof_visibilities=10 visibilities, g_nof_beamlets=2 and c_nof_channels=64. Hence the
+--   number of output bits per BG sync interval is:
+--
+--     nof_complex * cor_out_dat_w * c_nof_visibilities * c_nof_channels * g_nof_beamlets = 2 * 32 * 10 * 64 * 2
+--                                                                                        = 81920 bits = 1280 complex words
+--
+--   The BG sync interval in the tb takes c_bg_nof_clk_per_sync = c_nof_blocks_per_sync * c_bg_block_period = 640 * 4 = 2560 clk
+--   cycles, so g_bg_duty_cycle_gap=0 can be used, because 2560 > 1280 at the cor_src_out sosi.re/im interface.
+--   The data rate at the udp_tx_snk_in_arr(0).re/im interface is 125MHz * 32b = 4 Gbps. However at the 1GbE serial PHY interface
+--   the burst data rate must remain < 1Gbps. Per channel there are:
+--
+--     gap size     = c_network_eth_gap_len * 8b = 96 bits for some idle time between packets
+--     header size  = 21 * 32b = 672 bits
+--     tail size    = 1 * 32b = 32 bits
+--     payload size = nof_complex * cor_out_dat_w * c_nof_visibilities = 2 * 32 * 10 = 640 bits
+--     packet size  = gap size + header size + payload size + tail size = 96 + 672 + 640 + 32 = 1440 bits
+--
+--   The transport over 1GbE takes at least 1440 ns @ 1Gbps. One dp_clk cycle is 5 ns (= 1/f_clk = 1/200M), so 1440 / 5 = 288
+--   dp_clk cycles. The visibilities arrive in c_nof_visibilities=10 dp_clk cycles in cor_src_out. Therefore:
+--
+--     c_inter_channel_delay must be > 288 - c_nof_visibilities = 288 - 10 = 278 --> 279
+--
+--   However with this c_inter_channel_delay the total offload time per sync interval becomes about:
+--
+--     c_inter_channel_delay * c_nof_channels * g_nof_beamlets = 279 * 64 * 2 = 35712 dp_clk cycles
+--
+--   This is > BG sync interval of 2560 dp_clk cycles with g_bg_duty_cycle_gap=0. Therefore g_bg_duty_cycle_gap needs to be
+--   increased to:
+--
+--     c_bg_block_period = c_bg_nof_clk_per_sync / c_nof_blocks_per_sync = 35712 / 640 = 55.8 --> 56
+--     g_bg_duty_cycle_gap = c_bg_block_period - c_frame_size * c_interleave_factor = 56 - 2*2 = 52
+--
+--   Therefore in combination with c_inter_channel_delay=279 use g_bg_duty_cycle_gap >= 52 to make it work for 1GbE @ 1Gbps.
+--   To make it work with N_PN = 128 nodes and 2 * 10GbE links to the data writer the c_inter_channel_delay needs to be larger
+--   to reduce the rate per 1GbE link to < 20 Gbps / 128 = 156.25 Mbps.
+--
+-- * Set c_inter_channel_delay on hardware with 128 PN and 2 10GbE links:
 --   In total there are nof_beamlets * nof_channels * nof_visibilities per t_int = 1.024 sec, so 88 * 64 * 300 * 2 * 32b / 1.024s =
 --   105.6 Mbps on average (8 bit mode with 88 beamlets) or 144 Mbps (6 bit mode with 120 beamlets), which can run over one 1GbE
 --   link. The total Apertif correlator output rate for N_PN = 128 nodes to the Apertif data writer is 128 * 105.6M = 13.5168 Gbps
@@ -80,9 +118,9 @@
 --
 --   We need to set an appropriate number of c_inter_channel_delay cycles for a constant visibility buffer output rate. The
 --   correlator outputs nof_beamlets * nof_channels = 88 * 64 = 5632 respectively 120 * 64 = 7680 blocks of nof_visibilities =
---   300 visibility samples per t_int = N_int / f_sub = 1.024 s = 204.8M clk cycles @ f_clk = 200 MHz. Hence per block there
---   are maximum 36363 or 26666 clk cycles available. The block itself takes nof_visibilities = 300 clk cycles. Hence the
---   maximum c_inter_channel_delay becomes 36363 - 300 = 36063 or 26666 - 300 = 26066 clk cycles.
+--   300 visibility samples per t_int = N_int / f_sub = 1.024 s = 204.8M dp_clk cycles @ f_clk = 200 MHz. Hence per block there
+--   are maximum 36363 or 26666 dp_clk cycles available. The block itself takes nof_visibilities = 300 dp_clk cycles. Hence the
+--   maximum c_inter_channel_delay becomes 36363 - 300 = 36063 or 26666 - 300 = 26366 dp_clk cycles.
 --   The minimum c_inter_channel_delay depends on the capacity of the two 10GbE links. Using the total Apertif correlator output 
 --   this yields about 13.5168G/20GbE * max(c_inter_channel_delay) = 24373 for 88 beamlets respectively 24022 for 120 beamlets.
 --
@@ -128,13 +166,14 @@
 --   . u_dut/wpfb_src_out_arr(0).re/im, radix decimal
 --   . cor_src_out.re/im
 
-LIBRARY IEEE, common_lib, mm_lib, unb1_board_lib, dp_lib, diag_lib, wpfb_lib;
+LIBRARY IEEE, common_lib, mm_lib, unb1_board_lib, dp_lib, diag_lib, wpfb_lib, eth_lib;
 USE IEEE.std_logic_1164.ALL;
 USE IEEE.numeric_std.ALL;
 USE IEEE.math_real.ALL;
 USE common_lib.common_pkg.ALL;
 USE common_lib.common_math_pkg.ALL;
 USE common_lib.common_mem_pkg.ALL;
+USE common_lib.common_network_layers_pkg.ALL;
 USE common_lib.tb_common_pkg.ALL;
 USE common_lib.tb_common_mem_pkg.ALL;
 USE mm_lib.mm_file_pkg.ALL;
@@ -147,25 +186,34 @@ USE wpfb_lib.wpfb_pkg.ALL;
 
 ENTITY tb_node_apertif_unb1_correlator_processing_output IS
   GENERIC (
+    g_tb_end                : BOOLEAN := TRUE;   -- when TRUE then tb_end ends this simulation, else a higher multi-testbench will end the simulation
+    g_tb_index              : NATURAL := 0;      -- use different index to avoid MM file conflict in multi tb
+    g_nof_bg_sync           : NATURAL := 2;   -- [t] number of BG sync intervals, is number of integration intervals
     -- DUT settings
     g_nof_pn                : NATURAL := 2;   -- number of PN, choose >= 1
     g_nof_10G               : NATURAL := 2;   -- number of 10G input per PN, choose >= 2 and even (to fit c_interleave_factor=2)
     g_nof_beamlets          : NATURAL := 2;   -- [bu_i], is 88 in 8bit, 120 in 6bit beamlet mode
     g_nof_tchan_per_sync    : NATURAL := 10;  -- [t_c], is 12500, choose > number of taps of WPFB to simulate longer than the FIR impulse response
-    g_inter_channel_delay   : NATURAL := 1;   -- throttle correlator output
+    g_inter_channel_delay   : NATURAL := 279;   -- throttle correlator output, 279 fits 1GbE @ 1 Gbps for the default tb settings
+    g_use_wpfb              : BOOLEAN := TRUE;   -- passed on to g_use_wpfb of the processing DUT
     g_use_prefilter         : BOOLEAN := TRUE;
     g_use_prefilter_ones    : BOOLEAN := FALSE;
-    -- Input settings
-    g_nof_sync              : NATURAL := 3;   -- [t] number of sync intervals, is number ofintegration intervals
+    -- BG settings
+    g_bg_duty_cycle_gap     : NATURAL := 56;   -- 0 or small for faster simulation (>= 52 when g_inter_channel_delay = 279), use N_clk - Q_interleave*g_nof_beamlets for HW
     g_phasor_ampl           : REAL := 0.25;   -- range 0:1 where by 1.0 corresponds to maximum +amplitude, use <= 1.0 / 2**c_wpfb_fil_in_backoff_w
     g_phasor_freq           : REAL := 1.0;    -- for N=64 point FFT choose channel in range -32.0 : 0.0 : 31.0
     g_phasor_phase          : REAL := 0.0     -- [0:2pi>, has nearly no effect on correlator output, because all inputs use the same phasor
   );
+  PORT (
+    tb_end : OUT STD_LOGIC   -- follows the internal i_tb_end, for use by a multi tb
+  );
+  
 END tb_node_apertif_unb1_correlator_processing_output;
 
 
 ARCHITECTURE tb OF tb_node_apertif_unb1_correlator_processing_output IS
 
+  CONSTANT c_eth_clk_period             : TIME := 40 ns;  -- 25 MHz XO on UniBoard
   CONSTANT c_mm_clk_period              : TIME := 1 ns;
   CONSTANT c_dp_clk_period              : TIME := 5 ns;
   CONSTANT c_cross_clock_domain_latency : NATURAL := 20;
@@ -177,26 +225,37 @@ ARCHITECTURE tb OF tb_node_apertif_unb1_correlator_processing_output IS
 
   -- Simulation
   CONSTANT c_sim                   : BOOLEAN := TRUE;
-  
-  -- . DUT Input BG [t][bu_i]t_c][tc]
-  CONSTANT c_nof_sync              : NATURAL := g_nof_sync;                           -- [t], choose 3 or some more if BG with active xon does not overflow output
   CONSTANT c_nof_beamlets          : NATURAL := g_nof_beamlets;                       -- [bu_i], is 2 in tb, 88 in 8bit, 120 in 6bit beamlet mode
+  CONSTANT c_nof_bg_sync           : NATURAL := g_nof_bg_sync;                        -- [t], choose 3 or some more if BG with active xon does not overflow output
+  CONSTANT c_nof_cor_sync          : NATURAL := g_nof_bg_sync * c_nof_beamlets;
   CONSTANT c_nof_tchan_per_sync    : NATURAL := g_nof_tchan_per_sync;                 -- [t_c], is 12500, choose > number of taps of WPFB to simulate longer than the FIR impulse response
-  CONSTANT c_nof_points            : NATURAL := 64;                                   -- [tc], is 64
+  
+  -- . DUT TP [t][bu_i][ti]
+  --          [t][bu_i][t_c][tc]
+  CONSTANT c_nof_points            : NATURAL := 64;            -- [tc]
+  CONSTANT c_nof_channels          : NATURAL := c_nof_points;                         -- [ch] = [tc], is 64
   CONSTANT c_nof_tsub_per_sync     : NATURAL := c_nof_tchan_per_sync*c_nof_points;    -- [ti] = [t_c][tc] = [t_c][ch], is N_int = 12500 * 64 = 800000, is 10 * 64 = 640 in tb
-  CONSTANT c_nof_blocks_per_sync   : NATURAL := c_nof_beamlets*c_nof_tchan_per_sync;  -- [bu_i][t_c], is 2 * 10 = 20 in tb, 88 * 12500 = 1100000 in 8b, 1500000 in 6b beamlet mode
-  CONSTANT c_interleave_factor     : NATURAL := 2;                                    -- [pair], is 2
+  CONSTANT c_nof_blocks_per_sync   : NATURAL := c_nof_tsub_per_sync;   -- [ti] : 800000 blocks
+  CONSTANT c_interleave_factor     : NATURAL := 2;                     -- [pair], is Q_interleave = 2  
   
-  CONSTANT c_bg_block_period       : NATURAL := c_nof_points;  -- [tc]
-  CONSTANT c_bg_nof_clk_per_sync   : NATURAL := g_nof_beamlets * c_nof_tsub_per_sync * c_interleave_factor;  -- = 800000 * 256
+  CONSTANT c_block_period          : NATURAL := 256;  -- N_clk
+  CONSTANT c_nof_clk_per_sync      : NATURAL := c_nof_blocks_per_sync * c_block_period;  -- = 800000 * 256 = 1.024 * 200M
   
   -- . DUT frame [t][bsn][mi]
-  CONSTANT c_nof_frames_per_sync   : NATURAL := c_nof_tsub_per_sync;                 -- [bsn] = [t_c][tc] = [ti], is 800000, or 640 in tb
-  CONSTANT c_frame_size            : NATURAL := c_nof_beamlets;                      -- [mi] = [bu_i], is 88, 120, or 2 in tb (by choice frame, message size mi for t is equal to bu_i)
-  CONSTANT c_nof_valid_per_sync    : NATURAL := c_frame_size*c_nof_frames_per_sync;  -- [bsn][mi] = [bu_i][ti], is 640 * 2 = 1280 in tb, 88 * 800000 = 7040000 in 8bit, 9600000 in 6bit beamlet mode
+  CONSTANT c_nof_frames_per_sync   : NATURAL := c_nof_blocks_per_sync;   -- [bsn] = [t_c][tc] = [ti], is 800000, or 640 in tb
+  CONSTANT c_frame_size            : NATURAL := g_nof_beamlets; -- [mi] = [bu_i] : M_blk = N_blk / Q_interleave = 176/2 = 88, by choice processed
+                                                                -- 88 * 800000 samples are transported as 800000 * 88 samples
+  CONSTANT c_nof_valid_per_sync    : NATURAL := c_nof_frames_per_sync*c_frame_size;  -- [bsn][mi] = [bu_i][ti], is 640 * 2 = 1280 in tb, 88 * 800000 = 7040000 in 8bit, 9600000 in 6bit beamlet mode
+
+  -- . DUT input BG  
+  CONSTANT c_bg_block_size          : NATURAL := c_frame_size;  -- [mi]
+  CONSTANT c_bg_nof_blocks_per_sync : NATURAL := c_nof_frames_per_sync;  -- [bsn] = [ti], is 10 * 64 = 640 in tb, 12500 * 64 = 800000 on HW
+  
+  CONSTANT c_bg_duty_cycle_gap      : NATURAL := smallest(g_bg_duty_cycle_gap, c_block_period - c_frame_size * c_interleave_factor);
+  CONSTANT c_bg_block_period        : NATURAL := c_frame_size * c_interleave_factor + c_bg_duty_cycle_gap;
+  CONSTANT c_bg_nof_clk_per_sync    : NATURAL := c_nof_blocks_per_sync * c_bg_block_period;
   
   -- . correlator DUT input
-  CONSTANT c_nof_channels          : NATURAL := c_nof_points;                         -- [ch] = [tc], is 64
   CONSTANT c_nof_tp                : NATURAL := g_nof_pn * g_nof_10G;                 -- number of telescope paths
 
   CONSTANT c_nof_visibilities      : NATURAL := c_nof_tp * (c_nof_tp+1) / 2;
@@ -206,6 +265,13 @@ ARCHITECTURE tb OF tb_node_apertif_unb1_correlator_processing_output IS
   CONSTANT c_in_dat_w             : NATURAL := c_wpfb_apertif_channels.fil_in_dat_w;  -- =  8 bit
   CONSTANT c_in_complex_w         : NATURAL := c_nof_complex * c_in_dat_w;            -- = 16 bit, complex Im & Re
   CONSTANT c_cor_out_dat_w        : NATURAL := 32;
+  
+  -- . 1GbE output
+  CONSTANT c_eth_check_nof_packets  : NATURAL := c_nof_cor_sync * c_nof_channels;
+  CONSTANT c_eth_header_size        : NATURAL := 21;  -- (pad(2) + eth(14) + ip(20) + udp(8) + app(16+24))/4 = 84 / 4 
+  CONSTANT c_udp_payload_size       : NATURAL := c_nof_complex * c_nof_visibilities;
+  CONSTANT c_eth_packet_size        : NATURAL := c_eth_header_size + c_udp_payload_size;
+
                                      
   -- Phasor: exp(j*angle) = cos(angle) + j*sin(angle)
   -- A complex FFT of N points has N bins or channels: ch = -N/2:0:N/2-1.
@@ -263,7 +329,12 @@ ARCHITECTURE tb OF tb_node_apertif_unb1_correlator_processing_output IS
                                                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                                               0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
                                                               
-  SIGNAL tb_end                   : STD_LOGIC := '0';
+  CONSTANT c_nof_visibility_streams   : NATURAL := 1;
+  
+  SIGNAL i_tb_end                 : STD_LOGIC := '0';
+  SIGNAL eth_done                 : STD_LOGIC := '0';
+  SIGNAL eth_clk                  : STD_LOGIC := '0';
+  SIGNAL eth1g_tse_clk            : STD_LOGIC;
   SIGNAL mm_clk                   : STD_LOGIC := '1';
   SIGNAL mm_rst                   : STD_LOGIC := '1';
   SIGNAL dp_clk                   : STD_LOGIC := '1';
@@ -272,33 +343,42 @@ ARCHITECTURE tb OF tb_node_apertif_unb1_correlator_processing_output IS
   
   SIGNAL bg_start                 : STD_LOGIC;
   SIGNAL bg_sync_cnt              : NATURAL := 0;
-  SIGNAL cor_src_out              : t_dp_sosi;
   SIGNAL cor_sync_cnt             : NATURAL;
+  SIGNAL cor_src_out              : t_dp_sosi;
   SIGNAL cor_expected_bsn         : NATURAL;
 
-  -- MM tb
-  SIGNAL tb_reg_diag_bg_mosi      : t_mem_mosi := c_mem_mosi_rst;
-  SIGNAL tb_reg_diag_bg_miso      : t_mem_miso;
-  SIGNAL tb_ram_diag_bg_mosi      : t_mem_mosi := c_mem_mosi_rst;
-  SIGNAL tb_ram_diag_bg_miso      : t_mem_miso;
+  SIGNAL udp_tx_snk_in_arr        : t_dp_sosi_arr(c_nof_visibility_streams-1 DOWNTO 0);
+  SIGNAL udp_tx_snk_out_arr       : t_dp_siso_arr(c_nof_visibility_streams-1 DOWNTO 0) := (OTHERS => c_dp_siso_rdy);
+  SIGNAL eth_sgout                : STD_LOGIC;
 
   -- MM dut
   -- . Input block generator
-  SIGNAL reg_diag_input_bg_mosi   : t_mem_mosi := c_mem_mosi_rst;
-  SIGNAL reg_diag_input_bg_miso   : t_mem_miso;
-  SIGNAL ram_diag_input_bg_mosi   : t_mem_mosi := c_mem_mosi_rst;
-  SIGNAL ram_diag_input_bg_miso   : t_mem_miso;
+  SIGNAL reg_diag_input_bg_mosi            : t_mem_mosi := c_mem_mosi_rst;
+  SIGNAL reg_diag_input_bg_miso            : t_mem_miso;
+  SIGNAL ram_diag_input_bg_mosi            : t_mem_mosi := c_mem_mosi_rst;
+  SIGNAL ram_diag_input_bg_miso            : t_mem_miso;
 
   -- . Force data
   SIGNAL reg_dp_force_data_parallel_mosi   : t_mem_mosi := c_mem_mosi_rst;
   SIGNAL reg_dp_force_data_parallel_miso   : t_mem_miso;
   SIGNAL reg_dp_force_data_serial_mosi     : t_mem_mosi := c_mem_mosi_rst;
   SIGNAL reg_dp_force_data_serial_miso     : t_mem_miso;
+  
+  -- . Visibility offload via 1GbE
+  SIGNAL reg_dp_offload_tx_hdr_dat_mosi    : t_mem_mosi := c_mem_mosi_rst;
+  SIGNAL reg_dp_offload_tx_hdr_dat_miso    : t_mem_miso;
+
+  SIGNAL reg_dp_xonoff_output_mosi         : t_mem_mosi := c_mem_mosi_rst;
+  SIGNAL reg_dp_xonoff_output_miso         : t_mem_miso;
 
 BEGIN
 
-  dp_clk <= NOT dp_clk OR tb_end AFTER c_dp_clk_period/2;
-  mm_clk <= NOT mm_clk OR tb_end AFTER c_mm_clk_period/2;
+  tb_end <= i_tb_end;
+  
+  eth_clk <= NOT eth_clk OR i_tb_end AFTER c_eth_clk_period/2;
+  
+  dp_clk <= NOT dp_clk OR i_tb_end AFTER c_dp_clk_period/2;
+  mm_clk <= NOT mm_clk OR i_tb_end AFTER c_mm_clk_period/2;
   dp_rst <= '1', '0' AFTER c_dp_clk_period*7;
   mm_rst <= '1', '0' AFTER c_mm_clk_period*7;
 
@@ -310,20 +390,25 @@ BEGIN
     proc_common_wait_until_low(dp_clk, dp_rst);
     proc_common_wait_until_low(mm_clk, dp_rst);
 
+    -- Offload enable
+    proc_mem_mm_bus_wr(0, 1, mm_clk, reg_dp_xonoff_output_miso, reg_dp_xonoff_output_mosi);
+    
+    -- Offload header write dst MAC just to check the MM address order (top down list is 36 DOWNTO 0)
+    proc_mem_mm_bus_wr(35, 16#0000DCEF#, mm_clk, reg_dp_offload_tx_hdr_dat_miso, reg_dp_offload_tx_hdr_dat_mosi);  -- dst mac hi
+    proc_mem_mm_bus_wr(34, 16#56789ABC#, mm_clk, reg_dp_offload_tx_hdr_dat_miso, reg_dp_offload_tx_hdr_dat_mosi);  -- dst mac lo
+    
     -- Prepare force data
-    IF c_tp_force_zero THEN
-      -- Write force data 0 to TP c_sel_tp
-      --  stream   reg     data
-      --      0      0        [0]     R/W  force enable or default disable for data pass on
-      --      0      1     [31:0]     R/W  force sosi data
-      --      0      2     [31:0]     R/W  force sosi re
-      --      0      3     [31:0]     R/W  force sosi im
-      --      1     4:7    idem
-      proc_mem_mm_bus_wr(c_tp_sel*4 + 0, 1, mm_clk, reg_dp_force_data_parallel_miso, reg_dp_force_data_parallel_mosi);  -- force enable
-      proc_mem_mm_bus_wr(c_tp_sel*4 + 1, 0, mm_clk, reg_dp_force_data_parallel_miso, reg_dp_force_data_parallel_mosi);  -- force data = 0 (no increment fixed by generic)
-      proc_mem_mm_bus_wr(c_tp_sel*4 + 2, 0, mm_clk, reg_dp_force_data_parallel_miso, reg_dp_force_data_parallel_mosi);  -- force re = 0 (no increment fixed by generic)
-      proc_mem_mm_bus_wr(c_tp_sel*4 + 3, 0, mm_clk, reg_dp_force_data_parallel_miso, reg_dp_force_data_parallel_mosi);  -- force im = 0 (no increment fixed by generic)
-    END IF;
+    --  -- Write force data 0 to TP c_sel_tp
+    --  --  stream   reg     data
+    --  --      0      0        [0]     R/W  force enable or default disable for data pass on
+    --  --      0      1     [31:0]     R/W  force sosi data
+    --  --      0      2     [31:0]     R/W  force sosi re
+    --  --      0      3     [31:0]     R/W  force sosi im
+    --  --      1     4:7    idem
+    --  proc_mem_mm_bus_wr(c_tp_sel*4 + 0, 1, mm_clk, reg_dp_force_data_parallel_miso, reg_dp_force_data_parallel_mosi);  -- force enable
+    --  proc_mem_mm_bus_wr(c_tp_sel*4 + 1, 0, mm_clk, reg_dp_force_data_parallel_miso, reg_dp_force_data_parallel_mosi);  -- force data = 0 (no increment fixed by generic)
+    --  proc_mem_mm_bus_wr(c_tp_sel*4 + 2, 0, mm_clk, reg_dp_force_data_parallel_miso, reg_dp_force_data_parallel_mosi);  -- force re = 0 (no increment fixed by generic)
+    --  proc_mem_mm_bus_wr(c_tp_sel*4 + 3, 0, mm_clk, reg_dp_force_data_parallel_miso, reg_dp_force_data_parallel_mosi);  -- force im = 0 (no increment fixed by generic)
       
     -- Write phasor waveform to BG
     FOR I IN 0 TO c_nof_points-1 LOOP
@@ -332,32 +417,19 @@ BEGIN
     proc_common_wait_some_cycles(mm_clk, c_cross_clock_domain_latency);
     proc_common_wait_some_cycles(dp_clk, c_cross_clock_domain_latency);
     -- Start the BG
-    proc_mem_mm_bus_wr(3, 0, mm_clk, reg_diag_input_bg_miso, reg_diag_input_bg_mosi);  -- gap = 0
-    proc_mem_mm_bus_wr(0, 1, mm_clk, reg_diag_input_bg_miso, reg_diag_input_bg_mosi);  -- enable
+    proc_mem_mm_bus_wr(3, c_bg_duty_cycle_gap, mm_clk, reg_diag_input_bg_miso, reg_diag_input_bg_mosi);  -- gap
+    proc_mem_mm_bus_wr(0,                   1, mm_clk, reg_diag_input_bg_miso, reg_diag_input_bg_mosi);  -- enable
     proc_common_gen_pulse(dp_clk, bg_start);
 
     WAIT;
   END PROCESS;
   
-  tb_end <= '1' WHEN rising_edge(dp_clk) AND bg_sync_cnt>c_nof_sync;
-  
-  p_bg_sync_cnt : PROCESS
-  BEGIN
-    bg_sync_cnt <= 0;
-    proc_common_wait_until_hi_lo(dp_clk, bg_start);
-    WHILE TRUE LOOP
-      bg_sync_cnt <= bg_sync_cnt + 1;
-      proc_common_wait_some_cycles(dp_clk, c_bg_nof_clk_per_sync);
-    END LOOP;
-    WAIT;  -- to avoid infinite loop warning due to process without explicite WAIT statement
-  END PROCESS;      
-    
-  u_dut : ENTITY work.node_apertif_unb1_correlator_processing
+  u_dut_proc : ENTITY work.node_apertif_unb1_correlator_processing
   GENERIC MAP (
     g_sim                 => c_sim,
     g_use_input_bg        => TRUE,
     g_use_dp_force_data   => TRUE,
-    g_use_wpfb            => TRUE,
+    g_use_wpfb            => g_use_wpfb,
     g_use_prefilter       => g_use_prefilter,    -- use default g_coefs_file_prefix
     g_use_prefilter_ones  => g_use_prefilter_ones,
     g_use_cor             => TRUE,
@@ -392,7 +464,135 @@ BEGIN
     reg_dp_force_data_serial_mosi   => reg_dp_force_data_serial_mosi,
     reg_dp_force_data_serial_miso   => reg_dp_force_data_serial_miso
   );
+  
+  u_dut_output : ENTITY work.node_apertif_unb1_correlator_output   -- visibility offload towards 1GbE
+  GENERIC MAP (
+    g_sim              => c_sim,
+    g_nof_visibilities => c_nof_visibilities,   -- = c_nof_tp * (c_nof_tp+1) / 2
+    g_cor_out_dat_w    => c_cor_out_dat_w
+  )
+  PORT MAP (
+    mm_rst                         => mm_rst,
+    mm_clk                         => mm_clk,
+    dp_rst                         => dp_rst,
+    dp_clk                         => dp_clk,
+
+    snk_in                         => cor_src_out,   -- correlator output that is also verified by p_verify_cor_src_out
+
+    src_out                        => udp_tx_snk_in_arr(0),   -- to udp_tx_sosi_arr of u_ctrl_unb1_board
+    src_in                         => udp_tx_snk_out_arr(0),  -- from udp_tx_siso_arr of u_ctrl_unb1_board
+
+    reg_dp_offload_tx_hdr_dat_mosi => reg_dp_offload_tx_hdr_dat_mosi,   -- tb writes the dst MAC words via this MM port
+    reg_dp_offload_tx_hdr_dat_miso => reg_dp_offload_tx_hdr_dat_miso, 
 
+    reg_dp_xonoff_output_mosi      => reg_dp_xonoff_output_mosi,   -- tb writes 1 to address 0 to enable the offload
+    reg_dp_xonoff_output_miso      => reg_dp_xonoff_output_miso,
+
+    ID                             => c_pn_id
+  );
+
+  -----------------------------------------------------------------------------
+  -- Use 1GbE for back pressure
+  -----------------------------------------------------------------------------
+  u_ctrl_unb1_board : ENTITY unb1_board_lib.ctrl_unb1_board
+  GENERIC MAP (
+    g_sim                     => c_sim,
+    g_sim_level               => 1,
+    --g_sim_flash_model         => FALSE,
+    --g_design_name             => g_design_name,
+    --g_design_note             => g_design_note,
+    --g_stamp_date              => g_stamp_date,
+    --g_stamp_time              => g_stamp_time, 
+    --g_stamp_svn               => g_stamp_svn, 
+    --g_fw_version              => c_fw_version,
+    --g_mm_clk_freq             => c_unb1_board_mm_clk_freq_50M,
+    g_udp_offload             => TRUE,
+    g_udp_offload_nof_streams => c_nof_visibility_streams,  --1
+    --g_use_phy                 => c_use_phy,
+    --g_aux                     => c_unb1_board_aux,
+    --g_dp_clk_use_pll          => TRUE,
+    g_xo_clk_use_pll          => TRUE
+  )
+  PORT MAP (
+    mm_clk_out               => OPEN,
+    mm_clk                   => mm_clk,
+    mm_rst                   => OPEN,
+
+    mm_locked                => '1',
+    mm_locked_out            => OPEN,
+
+    epcs_clk                 => '0',
+    epcs_clk_out             => OPEN,
+
+    dp_rst                   => OPEN,
+    dp_clk                   => OPEN,
+    dp_pps                   => OPEN,
+    dp_rst_in                => dp_rst,   -- use the dp_rst of this tb
+    dp_clk_in                => dp_clk,   -- use the dp_clk of this tb
+
+    cal_rec_clk              => OPEN,
+    
+    this_chip_id             => OPEN,    
+    
+    pout_wdi                 => '1',
+
+    udp_tx_sosi_arr          => udp_tx_snk_in_arr,    -- offload stream from u_dut_output
+    udp_tx_siso_arr          => udp_tx_snk_out_arr,   -- flow control back to u_dut_output
+       
+    CLK                      => dp_clk,
+    PPS                      => '0',
+    WDI                      => OPEN,
+    INTA                     => OPEN,
+    INTB                     => OPEN,
+    VERSION                  => (OTHERS=>'0'),
+    ID                       => c_pn_id,
+    TESTIO                   => OPEN,
+    sens_sc                  => OPEN,
+    sens_sd                  => OPEN,
+    
+    eth1g_tse_clk_out        => eth1g_tse_clk,   -- TSE clock output is looped back to the TSE clock input
+    eth1g_tse_clk            => eth1g_tse_clk,
+    eth1g_mm_rst             => dp_rst,          -- NOTE(review): dp_rst is used as MM reset here while mm_rst is available - confirm
+    eth1g_tse_mosi           => c_mem_mosi_rst,  -- no MM access to the 1GbE from this tb
+    eth1g_reg_mosi           => c_mem_mosi_rst,
+    eth1g_ram_mosi           => c_mem_mosi_rst,
+    ETH_clk                  => eth_clk,         -- 25 MHz, see c_eth_clk_period
+    ETH_SGIN                 => '0',             -- no 1GbE receive data in this tb
+    ETH_SGOUT                => eth_sgout        -- serial 1GbE output, monitored by u_eth_statistics
+  );
+
+  ------------------------------------------------------------------------------
+  -- Verify proper DUT output using Ethernet packet statistics
+  ------------------------------------------------------------------------------
+  u_eth_statistics : ENTITY eth_lib.eth_statistics
+    GENERIC MAP (
+      g_runtime_nof_packets => c_eth_check_nof_packets,   -- = c_nof_cor_sync * c_nof_channels
+      g_runtime_timeout     => 1000 us,
+      g_check_nof_valid     => TRUE,
+      g_check_nof_valid_ref => c_eth_check_nof_packets*c_eth_packet_size   -- c_eth_packet_size = header size + payload size
+    )
+  PORT MAP (  
+    eth_serial_in => eth_sgout,   -- serial 1GbE output of u_ctrl_unb1_board
+    tb_end        => eth_done     -- is used to end the simulation via i_tb_end
+  );
+  
+  -- Stop simulation using severity FAILURE
+  proc_common_stop_simulation(g_tb_end, dp_clk, eth_done, i_tb_end);
+
+  -- Count the BG sync intervals in bg_sync_cnt until i_tb_end, the tb_end itself is controlled by eth_done.
+  -- . bg_sync_cnt counts dut/bg_sosi.sync intervals based on c_bg_nof_clk_per_sync and increments after the bg_sosi.sync
+  p_bg_sync_cnt : PROCESS
+  BEGIN
+    bg_sync_cnt <= 0;
+    proc_common_wait_until_hi_lo(dp_clk, bg_start);   -- start counting after the bg_start pulse
+    WHILE i_tb_end='0' LOOP
+      bg_sync_cnt <= bg_sync_cnt + 1;
+      proc_common_wait_some_cycles(dp_clk, c_bg_nof_clk_per_sync);   -- one BG sync interval
+    END LOOP;
+    WAIT;  -- to avoid infinite loop warning due to process without explicit WAIT statement
+  END PROCESS;
+
+  -- Use cor_sync_cnt to control verification
   p_verify_cor_src_out : PROCESS(dp_clk)
     VARIABLE v_ch           : NATURAL;
     VARIABLE v_sync_cnt     : NATURAL;
@@ -404,11 +604,13 @@ BEGIN
         v_ch := TO_UINT(cor_src_out.channel) MOD c_nof_channels;
         IF g_use_prefilter=FALSE THEN
           -- no prefilter, only fft
+          verify_activated <= '1';
           ASSERT SIGNED(cor_src_out.re(c_cor_out_dat_w-1 DOWNTO 0)) = c_exp_fft_channel_vis_re(v_ch) REPORT "Wrong FFT - correlator output real" SEVERITY ERROR;
           ASSERT SIGNED(cor_src_out.im(c_cor_out_dat_w-1 DOWNTO 0)) = c_exp_fft_channel_vis_im(v_ch) REPORT "Wrong FFT - correlator output imag" SEVERITY ERROR;
 
-        ELSIF bg_sync_cnt>1 THEN
+        ELSIF cor_sync_cnt>1 THEN
           -- skip first sync interval to flush out initial FIR impulse response, works be cause all beamlets in time have same input phasor
+          verify_activated <= '1';
           IF g_use_prefilter_ones=TRUE THEN
             -- transparant prefilter with ones coefficients
             ASSERT SIGNED(cor_src_out.re(c_cor_out_dat_w-1 DOWNTO 0)) = c_exp_transparant_channel_vis_re(v_ch) REPORT "Wrong PFB with transparant PFIR - correlator output real" SEVERITY ERROR;
@@ -422,15 +624,10 @@ BEGIN
         
         -- Verify BSN
         IF cor_src_out.sync='1' THEN
-          v_expected_bsn := (v_sync_cnt / c_nof_beamlets) * c_nof_beamlets * c_nof_tchan_per_sync;
+          v_expected_bsn := (v_sync_cnt / g_nof_beamlets) * c_nof_blocks_per_sync;
           ASSERT TO_UINT(cor_src_out.bsn) = v_expected_bsn REPORT "Wrong BSN in correlator output" SEVERITY ERROR;
           v_sync_cnt := v_sync_cnt + 1;
         END IF;
-        
-        -- Verify that tb actually does do verification
-        IF bg_sync_cnt>=c_nof_sync THEN
-          verify_activated <= '1';
-        END IF;
       END IF;
     END IF;
     -- For debug assing variable to signal for view in Wave window
@@ -438,9 +635,10 @@ BEGIN
     cor_expected_bsn <= v_expected_bsn;
   END PROCESS;
   
-  p_verify_activated : PROCESS(tb_end)
+  -- Verify at the end of the simulation that the tb actually did do verification
+  p_verify_activated : PROCESS(i_tb_end)
   BEGIN
-    IF tb_end='1' THEN
+    IF i_tb_end='1' THEN   -- verify_activated is set by p_verify_cor_src_out
       ASSERT verify_activated='1' REPORT "TB verification was not done" SEVERITY ERROR;
     END IF;
   END PROCESS;