diff --git a/applications/lofar2/libraries/sdp/hdllib.cfg b/applications/lofar2/libraries/sdp/hdllib.cfg
index 6e7ad9ac344bfb31412ec917035554d5013d8ca7..8369aee4be263ba8176c1e9f5fd799569f5c09ad 100644
--- a/applications/lofar2/libraries/sdp/hdllib.cfg
+++ b/applications/lofar2/libraries/sdp/hdllib.cfg
@@ -22,6 +22,8 @@ synth_files =
     src/vhdl/sdp_beamformer_output.vhd
     src/vhdl/sdp_statistics_offload.vhd
     src/vhdl/sdp_crosslets_subband_select.vhd
+    src/vhdl/sdp_crosslets_remote.vhd
+    src/vhdl/sdp_crosslets_remote_v2.vhd
     src/vhdl/node_sdp_adc_input_and_timing.vhd
     src/vhdl/node_sdp_filterbank.vhd
     src/vhdl/node_sdp_oversampled_filterbank.vhd
@@ -37,12 +39,16 @@ test_bench_files =
     tb/vhdl/tb_sdp_crosslets_subband_select.vhd
     tb/vhdl/tb_sdp_beamformer_output.vhd
     tb/vhdl/tb_tb_sdp_beamformer_output.vhd
+    tb/vhdl/tb_sdp_beamformer_remote_ring.vhd
+    tb/vhdl/tb_sdp_crosslets_remote_ring.vhd
 
 regression_test_vhdl =
     tb/vhdl/tb_sdp_info.vhd
     tb/vhdl/tb_sdp_statistics_offload.vhd
     tb/vhdl/tb_tb_sdp_statistics_offload.vhd
     tb/vhdl/tb_sdp_crosslets_subband_select.vhd
+    tb/vhdl/tb_sdp_crosslets_remote_ring.vhd
+    tb/vhdl/tb_sdp_beamformer_remote_ring.vhd
     tb/vhdl/tb_tb_sdp_beamformer_output.vhd
 
 [modelsim_project_file]
diff --git a/applications/lofar2/libraries/sdp/src/vhdl/node_sdp_correlator.vhd b/applications/lofar2/libraries/sdp/src/vhdl/node_sdp_correlator.vhd
index d5c96ea99d864ddf79b199ce24fe9c532dc776bf..fa97a9f7d77b10986605677f326a2135b1a0b8e2 100644
--- a/applications/lofar2/libraries/sdp/src/vhdl/node_sdp_correlator.vhd
+++ b/applications/lofar2/libraries/sdp/src/vhdl/node_sdp_correlator.vhd
@@ -94,20 +94,8 @@ end node_sdp_correlator;
 
 architecture str of node_sdp_correlator is
   constant c_nof_controllers      : positive := 2;
-  constant c_block_size           : natural  := c_sdp_N_crosslets_max * c_sdp_S_pn;
-  constant c_block_size_longwords : natural  := ceil_div(c_block_size, 2);  -- 32b -> 64b
-  constant c_data_w               : natural  := c_sdp_W_crosslet * c_nof_complex;
-
--- The size for 1 block is probably already enough as the number of blocks received
--- on the remote input of the mux probably have enough gap time in between. Just
--- to be sure to not run into issues in the future, the fifo size is increased to
--- buffer the maximum nof blocks per block period.
-  constant c_mux_fifo_size   : natural  := 2**ceil_log2(g_P_sq * c_block_size_longwords);
--- c_fifo_fill_size should be at least 2 * c_block_size_longwords as dp_repack_data
--- repacks from 64bit to 32bit. Chosing 3x to have some room.
-  constant c_fifo_fill_size  : natural  := 2**ceil_log2(3 * c_block_size_longwords);
-
--- crosslet statistics offload
+
+  -- crosslet statistics offload
   signal ram_st_offload_copi           : t_mem_copi := c_mem_copi_rst;
   signal ram_st_offload_cipo           : t_mem_cipo := c_mem_cipo_rst;
 
@@ -117,21 +105,9 @@ architecture str of node_sdp_correlator is
   signal controller_cipo_arr           : t_mem_cipo_arr(0 to c_nof_controllers - 1) := (others => c_mem_cipo_rst);
 
   signal quant_sosi_arr                : t_dp_sosi_arr(c_sdp_P_pfb - 1 downto 0) := (others => c_dp_sosi_rst);
-  signal dp_bsn_sync_scheduler_src_out : t_dp_sosi := c_dp_sosi_rst;
   signal xsel_sosi                     : t_dp_sosi := c_dp_sosi_rst;
-  signal xsel_data_sosi                : t_dp_sosi := c_dp_sosi_rst;
-  signal local_sosi                    : t_dp_sosi := c_dp_sosi_rst;
-
   signal new_interval                  : std_logic;
 
-  signal ring_mux_sosi                 : t_dp_sosi := c_dp_sosi_rst;
-  signal ring_mux_siso                 : t_dp_siso := c_dp_siso_rdy;
-  signal dp_fifo_fill_sosi             : t_dp_sosi := c_dp_sosi_rst;
-  signal dp_fifo_fill_siso             : t_dp_siso := c_dp_siso_rdy;
-  signal rx_sosi                       : t_dp_sosi := c_dp_sosi_rst;
-  signal dispatch_invert_sosi_arr      : t_dp_sosi_arr(0 to g_P_sq - 1) := (others => c_dp_sosi_rst);
-  signal dispatch_sosi_arr             : t_dp_sosi_arr(g_P_sq - 1 downto 0) := (others => c_dp_sosi_rst);
-
   signal crosslets_sosi                : t_dp_sosi  := c_dp_sosi_rst;
   signal crosslets_copi                : t_mem_copi := c_mem_copi_rst;
   signal crosslets_cipo_arr            : t_mem_cipo_arr(g_P_sq - 1 downto 0) := (others => c_mem_cipo_rst);
@@ -203,168 +179,33 @@ begin
   xst_bs_sosi <= xsel_sosi;
 
   ---------------------------------------------------------------
-  -- Repack 32b to 64b
-  ---------------------------------------------------------------
-  -- repacking xsel re/im to data field.
-  p_wire_xsel_sosi : process(xsel_sosi)
-  begin
-    xsel_data_sosi <= xsel_sosi;
-    xsel_data_sosi.data(                c_sdp_W_crosslet - 1 downto 0)                <= xsel_sosi.re(c_sdp_W_crosslet - 1 downto 0);
-    xsel_data_sosi.data(c_nof_complex * c_sdp_W_crosslet - 1 downto c_sdp_W_crosslet) <= xsel_sosi.im(c_sdp_W_crosslet - 1 downto 0);
-  end process;
-
-  u_dp_repack_data_local : entity dp_lib.dp_repack_data
-  generic map (
-    g_in_dat_w       => c_data_w,
-    g_in_nof_words   => c_longword_w / c_data_w,
-    g_out_dat_w      => c_longword_w,
-    g_out_nof_words  => 1,
-    g_pipeline_ready => true  -- Needed for src_in.ready to snk_out.ready.
-  )
-  port map (
-    rst => dp_rst,
-    clk => dp_clk,
-
-    snk_in  => xsel_data_sosi,
-    src_out => local_sosi
-  );
-
-  ---------------------------------------------------------------
-  -- ring_mux
-  ---------------------------------------------------------------
-  u_ring_mux : entity ring_lib.ring_mux
-  generic map (
-    g_bsn_w        => c_dp_stream_bsn_w,
-    g_data_w       => c_longword_w,
-    g_channel_w    => c_word_w,
-    g_use_error    => false,
-    g_fifo_size    => array_init(c_mux_fifo_size, 2)
-  )
-  port map (
-    dp_clk => dp_clk,
-    dp_rst => dp_rst,
-
-    remote_sosi => from_ri_sosi,
-    local_sosi  => local_sosi,
-    mux_sosi    => ring_mux_sosi,
-    mux_siso    => ring_mux_siso
-  );
-
-  to_ri_sosi <= ring_mux_sosi;
-
-  -- fill fifo to remove gaps
-  u_dp_fifo_fill_eop : entity dp_lib.dp_fifo_fill_eop
-  generic map (
-    g_data_w         => c_longword_w,
-    g_bsn_w          => c_dp_stream_bsn_w,
-    g_empty_w        => c_dp_stream_empty_w,
-    g_channel_w      => c_dp_stream_channel_w,
-    g_error_w        => c_dp_stream_error_w,
-    g_use_bsn        => true,
-    g_use_empty      => true,
-    g_use_channel    => true,
-    g_use_error      => true,
-    g_use_sync       => true,
-    g_fifo_fill      => c_block_size_longwords,
-    g_fifo_size      => c_fifo_fill_size
-  )
-  port map (
-    wr_rst      => dp_rst,
-    wr_clk      => dp_clk,
-    rd_rst      => dp_rst,
-    rd_clk      => dp_clk,
-
-    snk_out     => ring_mux_siso,
-    snk_in      => ring_mux_sosi,
-
-    src_in      => dp_fifo_fill_siso,
-    src_out     => dp_fifo_fill_sosi
-  );
-
-  ---------------------------------------------------------------
-  -- Repack 64b to 32b
+  -- Local and remote crosslets
   ---------------------------------------------------------------
-  u_dp_repack_data_rx : entity dp_lib.dp_repack_data
+  u_sdp_crosslets_remote : entity work.sdp_crosslets_remote_v2
   generic map (
-    g_in_dat_w       => c_longword_w,
-    g_in_nof_words   => 1,
-    g_out_dat_w      => c_data_w,
-    g_out_nof_words  => c_longword_w / c_data_w,
-    g_pipeline_ready => true  -- Needed for src_in.ready to snk_out.ready.
+    g_P_sq => g_P_sq
   )
   port map (
-    rst => dp_rst,
-    clk => dp_clk,
-
-    snk_in  => dp_fifo_fill_sosi,
-    snk_out => dp_fifo_fill_siso,
-    src_out => rx_sosi
-  );
-
-  ---------------------------------------------------------------
-  -- dp_demux
-  ---------------------------------------------------------------
-  u_dp_demux : entity dp_lib.dp_demux
-  generic map (
-    g_mode              => 0,
-    g_nof_output        => g_P_sq,
-    g_remove_channel_lo => false,
-    g_sel_ctrl_invert   => true  -- TRUE when indexed (g_nof_input-1 DOWNTO 0)
-  )
-  port map (
-    rst => dp_rst,
-    clk => dp_clk,
-
-    snk_in      => rx_sosi,
-    src_out_arr => dispatch_invert_sosi_arr
-  );
-
-  dispatch_sosi_arr <= func_dp_stream_arr_reverse_range(dispatch_invert_sosi_arr);
-
-  ---------------------------------------------------------------
-  -- dp_bsn_aligner_v2
-  ---------------------------------------------------------------
-  u_mmp_dp_bsn_align_v2 : entity dp_lib.mmp_dp_bsn_align_v2
-  generic map(
-    -- for dp_bsn_align_v2
-    g_nof_streams             => g_P_sq,
-    g_bsn_latency_max         => 2,
-    g_nof_aligners_max        => 1,  -- 1 for Access scheme 3.
-    g_block_size              => c_block_size,
-    g_data_w                  => c_data_w,
-    g_use_mm_output           => true,
-    g_rd_latency              => 1,  -- Required for st_xst
-    -- for mms_dp_bsn_monitor_v2
-    -- Using c_sdp_N_clk_sync_timeout_xsub as g_nof_clk_per_sync is used for BSN monitor timeout.
-    g_nof_clk_per_sync        => c_sdp_N_clk_sync_timeout_xsub,
-    g_nof_input_bsn_monitors  => g_P_sq,
-    g_use_bsn_output_monitor  => true
-    )
-  port map (
-    -- Memory-mapped clock domain
-    mm_rst                  => mm_rst,
-    mm_clk                  => mm_clk,
-
-    reg_bsn_align_copi      => reg_bsn_align_copi,
-    reg_bsn_align_cipo      => reg_bsn_align_cipo,
-
-    reg_input_monitor_copi  => reg_bsn_monitor_v2_bsn_align_input_copi,
-    reg_input_monitor_cipo  => reg_bsn_monitor_v2_bsn_align_input_cipo,
-
-    reg_output_monitor_copi => reg_bsn_monitor_v2_bsn_align_output_copi,
-    reg_output_monitor_cipo => reg_bsn_monitor_v2_bsn_align_output_cipo,
-
-    -- Streaming clock domain
-    dp_rst     => dp_rst,
-    dp_clk     => dp_clk,
-
-    -- Streaming input
-    in_sosi_arr => dispatch_sosi_arr,
-
-    -- Output via local MM interface in dp_clk domain, when g_use_mm_output = TRUE.
-    mm_sosi     => crosslets_sosi,
-    mm_copi     => crosslets_copi,
-    mm_cipo_arr => crosslets_cipo_arr
+    dp_clk             => dp_clk,
+    dp_rst             => dp_rst,
+
+    xsel_sosi          => xsel_sosi,
+    from_ri_sosi       => from_ri_sosi,
+    to_ri_sosi         => to_ri_sosi,
+
+    crosslets_sosi     => crosslets_sosi,
+    crosslets_copi     => crosslets_copi,
+    crosslets_cipo_arr => crosslets_cipo_arr,
+
+    mm_rst             => mm_rst,
+    mm_clk             => mm_clk,
+
+    reg_bsn_align_copi                       => reg_bsn_align_copi,
+    reg_bsn_align_cipo                       => reg_bsn_align_cipo,
+    reg_bsn_monitor_v2_bsn_align_input_copi  => reg_bsn_monitor_v2_bsn_align_input_copi,
+    reg_bsn_monitor_v2_bsn_align_input_cipo  => reg_bsn_monitor_v2_bsn_align_input_cipo,
+    reg_bsn_monitor_v2_bsn_align_output_copi => reg_bsn_monitor_v2_bsn_align_output_copi,
+    reg_bsn_monitor_v2_bsn_align_output_cipo => reg_bsn_monitor_v2_bsn_align_output_cipo
   );
 
   ---------------------------------------------------------------
@@ -397,8 +238,8 @@ begin
   ---------------------------------------------------------------
   -- Connect 2 mm_controllers to the common_mem_mux output
   controller_copi_arr(0)  <= ram_st_xsq_copi;  -- MM access via QSYS MM bus
-  ram_st_xsq_cipo     <= controller_cipo_arr(0);
   controller_copi_arr(1)  <= ram_st_offload_copi;  -- MM access by UDP offload
+  ram_st_xsq_cipo     <= controller_cipo_arr(0);
   ram_st_offload_cipo <= controller_cipo_arr(1);
 
   u_mem_controller_mux : entity mm_lib.mm_master_mux
diff --git a/applications/lofar2/libraries/sdp/src/vhdl/sdp_beamformer_remote.vhd b/applications/lofar2/libraries/sdp/src/vhdl/sdp_beamformer_remote.vhd
index 11f74388b5f5cb189b8d17adfe68ea0044199480..00295713564f8764bcbd741e7a212a378b4c1f1b 100644
--- a/applications/lofar2/libraries/sdp/src/vhdl/sdp_beamformer_remote.vhd
+++ b/applications/lofar2/libraries/sdp/src/vhdl/sdp_beamformer_remote.vhd
@@ -37,11 +37,14 @@ use dp_lib.dp_stream_pkg.all;
 use work.sdp_pkg.all;
 
 entity sdp_beamformer_remote is
+  generic (
+    g_nof_aligners_max  : natural := c_sdp_N_pn_max
+  );
   port (
-    dp_clk      : in  std_logic;
-    dp_rst      : in  std_logic;
+    dp_clk        : in  std_logic;
+    dp_rst        : in  std_logic;
 
-    rn_index    : in  natural range 0 to c_sdp_N_pn_max - 1 := 0;
+    rn_index      : in  natural range 0 to c_sdp_N_pn_max - 1 := 0;
 
     local_bf_sosi : in  t_dp_sosi;
     from_ri_sosi  : in  t_dp_sosi;
@@ -65,13 +68,26 @@ end sdp_beamformer_remote;
 architecture str of sdp_beamformer_remote is
   constant c_data_w                : natural := c_nof_complex * c_sdp_W_beamlet_sum;
   constant c_block_size            : natural := c_sdp_S_sub_bf * c_sdp_N_pol_bf;
-  constant c_fifo_size             : natural := 2**ceil_log2((c_block_size * 9) / 16);  -- 9/16 = 36/64, 1 block of 64 bit words rounded to the next power of 2 = 1024.
+  constant c_fifo_size             : natural := 2**ceil_log2((c_block_size * 9) / 16);  -- 9/16 = 36/64, 1 block of
+                                                -- 64 bit words rounded to the next power of 2 = 1024.
+
+  -- Max 2 blocks latency per node in chain. Using c_bsn_latency_first_node = 1
+  -- for the first node is possible, because it does not have to align with a
+  -- remote input. With c_bsn_latency_first_node = 1 the circular buffer size
+  -- becomes true_log_pow2(1 + (g_nof_aligners_max - 1) * c_bsn_latency_max +
+  -- c_bsn_latency_first_node) = true_log_pow2(1 + (16 - 1) * 2 + 1) = 32
+  -- blocks, instead of true_log_pow2(1 + 16 * 2) = 64 blocks.
+  constant c_bsn_latency_max        : natural := 2;
+  constant c_bsn_latency_first_node : natural := 1;
+
+  signal chain_node_index        : natural range 0 to c_sdp_N_pn_max - 1 := 0;
 
-  signal dispatch_sosi_arr       : t_dp_sosi_arr(c_dual - 1 downto 0)  := (others => c_dp_sosi_rst);  -- 1 for local, 1 for remote.
+  -- c_sdp_P_sum = 2 streams, 1 for local, 1 for remote
+  signal dispatch_sosi_arr       : t_dp_sosi_arr(c_sdp_P_sum - 1 downto 0)  := (others => c_dp_sosi_rst);
   signal dp_fifo_sosi            : t_dp_sosi := c_dp_sosi_rst;
   signal dp_fifo_siso            : t_dp_siso := c_dp_siso_rdy;
-  signal beamlets_data_sosi_arr  : t_dp_sosi_arr(c_dual - 1 downto 0)  := (others => c_dp_sosi_rst);
-  signal beamlets_sosi_arr       : t_dp_sosi_arr(c_dual - 1 downto 0)  := (others => c_dp_sosi_rst);
+  signal beamlets_data_sosi_arr  : t_dp_sosi_arr(c_sdp_P_sum - 1 downto 0)  := (others => c_dp_sosi_rst);
+  signal beamlets_sosi_arr       : t_dp_sosi_arr(c_sdp_P_sum - 1 downto 0)  := (others => c_dp_sosi_rst);
   signal i_bf_sum_sosi           : t_dp_sosi := c_dp_sosi_rst;
   signal bf_sum_data_sosi        : t_dp_sosi := c_dp_sosi_rst;
 begin
@@ -79,8 +95,10 @@ begin
   p_wire_local_bf_sosi : process(local_bf_sosi)
   begin
     dispatch_sosi_arr(0) <= local_bf_sosi;
-    dispatch_sosi_arr(0).data(c_sdp_W_beamlet_sum - 1 downto 0)                   <= local_bf_sosi.re(c_sdp_W_beamlet_sum - 1 downto 0);
-    dispatch_sosi_arr(0).data(c_data_w - 1            downto c_sdp_W_beamlet_sum) <= local_bf_sosi.im(c_sdp_W_beamlet_sum - 1 downto 0);
+    dispatch_sosi_arr(0).data(c_sdp_W_beamlet_sum - 1 downto 0) <=
+                              local_bf_sosi.re(c_sdp_W_beamlet_sum - 1 downto 0);
+    dispatch_sosi_arr(0).data(c_data_w - 1 downto c_sdp_W_beamlet_sum) <=
+                              local_bf_sosi.im(c_sdp_W_beamlet_sum - 1 downto 0);
   end process;
 
   ---------------------------------------------------------------
@@ -126,20 +144,30 @@ begin
   ---------------------------------------------------------------
   -- dp_bsn_aligner_v2
   ---------------------------------------------------------------
+
+  -- The SDP beamformer starts at ring node 0 and outputs at the last ring
+  -- node, therefore the chain_node_index = the rn_index. The chain_node_index
+  -- does not wrap, because it starts at ring node 0. Therefore a design with
+  -- an SDP beamformer that is defined for g_nof_aligners_max = c_sdp_N_pn_max
+  -- = 16 will also work in a ring with fewer nodes.
+  chain_node_index <= rn_index;
+
   u_mmp_dp_bsn_align_v2 : entity dp_lib.mmp_dp_bsn_align_v2
   generic map(
     -- for dp_bsn_align_v2
-    g_nof_streams             => c_dual,
-    g_bsn_latency_max         => 2,  -- max 2 blocks latency
-    g_nof_aligners_max        => c_sdp_N_pn_max,
-    g_block_size              => c_block_size,
-    g_data_w                  => c_data_w,
-    g_use_mm_output           => false,
-    g_rd_latency              => 1,
+    g_nof_streams               => c_sdp_P_sum,
+    g_bsn_latency_max           => c_bsn_latency_max,
+    g_bsn_latency_first_node    => c_bsn_latency_first_node,
+    g_nof_aligners_max          => g_nof_aligners_max,
+    g_block_size                => c_block_size,
+    g_data_w                    => c_data_w,
+    g_use_mm_output             => false,
+    g_rd_latency                => 1,
     -- for mms_dp_bsn_monitor_v2
-    g_nof_clk_per_sync        => c_sdp_N_clk_sync_timeout,  -- Using c_sdp_N_clk_sync_timeout as g_nof_clk_per_sync is used for BSN monitor timeout.
-    g_nof_input_bsn_monitors  => c_dual,
-    g_use_bsn_output_monitor  => true
+    g_nof_clk_per_sync          => c_sdp_N_clk_sync_timeout,  -- Using c_sdp_N_clk_sync_timeout as g_nof_clk_per_sync
+                                                              -- is used for BSN monitor timeout.
+    g_nof_input_bsn_monitors    => c_sdp_P_sum,
+    g_use_bsn_output_monitor    => true
     )
   port map (
     -- Memory-mapped clock domain
@@ -159,7 +187,7 @@ begin
     dp_rst     => dp_rst,
     dp_clk     => dp_clk,
 
-    node_index => rn_index,
+    chain_node_index => chain_node_index,
 
     -- Streaming input
     in_sosi_arr  => dispatch_sosi_arr,
@@ -182,7 +210,7 @@ begin
   ---------------------------------------------------------------
   u_dp_complex_add : entity dp_lib.dp_complex_add
   generic map(
-    g_nof_inputs => c_dual,
+    g_nof_inputs => c_sdp_P_sum,
     g_data_w => c_sdp_W_beamlet_sum
   )
   port map(
diff --git a/applications/lofar2/libraries/sdp/src/vhdl/sdp_crosslets_remote.vhd b/applications/lofar2/libraries/sdp/src/vhdl/sdp_crosslets_remote.vhd
new file mode 100644
index 0000000000000000000000000000000000000000..97c1941ff2278c6f0218008e9a96d170e5b0e439
--- /dev/null
+++ b/applications/lofar2/libraries/sdp/src/vhdl/sdp_crosslets_remote.vhd
@@ -0,0 +1,267 @@
+-------------------------------------------------------------------------------
+--
+-- Copyright 2021
+-- ASTRON (Netherlands Institute for Radio Astronomy) <http://www.astron.nl/>
+-- P.O.Box 2, 7990 AA Dwingeloo, The Netherlands
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+-------------------------------------------------------------------------------
+
+-------------------------------------------------------------------------------
+--
+-- Author: R. van der Walle, E. Kooistra
+-- Purpose:
+-- . Implements the functionality for remote crosslets IO and aligning the
+--   local and remote crosslets in the node_sdp_correlator of the LOFAR2
+--   SDPFW design.
+-- Description:
+-------------------------------------------------------------------------------
+
+library IEEE, common_lib, dp_lib, reorder_lib, st_lib, mm_lib, ring_lib;
+use IEEE.std_logic_1164.all;
+use common_lib.common_pkg.all;
+use common_lib.common_mem_pkg.all;
+use common_lib.common_network_layers_pkg.all;
+use dp_lib.dp_stream_pkg.all;
+use ring_lib.ring_pkg.all;
+use work.sdp_pkg.all;
+
+entity sdp_crosslets_remote is
+  generic (
+    g_P_sq             : natural := c_sdp_P_sq
+  );
+  port (
+    dp_clk             : in  std_logic;
+    dp_rst             : in  std_logic;
+
+    xsel_sosi          : in  t_dp_sosi;
+    from_ri_sosi       : in  t_dp_sosi := c_dp_sosi_rst;
+    to_ri_sosi         : out t_dp_sosi;
+
+    crosslets_sosi     : out t_dp_sosi;
+    crosslets_copi     : in  t_mem_copi := c_mem_copi_rst;
+    crosslets_cipo_arr : out t_mem_cipo_arr(g_P_sq - 1 downto 0);
+
+    mm_rst             : in  std_logic;
+    mm_clk             : in  std_logic;
+
+    reg_bsn_align_copi                       : in  t_mem_copi := c_mem_copi_rst;
+    reg_bsn_align_cipo                       : out t_mem_cipo;
+    reg_bsn_monitor_v2_bsn_align_input_copi  : in  t_mem_copi := c_mem_copi_rst;
+    reg_bsn_monitor_v2_bsn_align_input_cipo  : out t_mem_cipo;
+    reg_bsn_monitor_v2_bsn_align_output_copi : in  t_mem_copi := c_mem_copi_rst;
+    reg_bsn_monitor_v2_bsn_align_output_cipo : out t_mem_cipo
+  );
+end sdp_crosslets_remote;
+
+architecture str of sdp_crosslets_remote is
+  constant c_block_size           : natural := c_sdp_N_crosslets_max * c_sdp_S_pn;
+  constant c_block_size_longwords : natural := ceil_div(c_block_size, 2);  -- 32b -> 64b
+  constant c_data_w               : natural := c_sdp_W_crosslet * c_nof_complex;
+  -- The channel field carries the index of time multiplexed crosslet packets
+  constant c_use_channel          : boolean := true;
+  constant c_channel_w            : natural := ceil_log2(g_P_sq);
+  -- With 32b data repacked in 64b one empty bit is enough. For crosslets the number
+  -- of 32b words (c_block_size) is even, so empty will always be 0. However, do
+  -- support odd sizes, to be safe.
+  constant c_use_empty            : boolean := true;
+  constant c_empty_w              : natural := 1;
+  -- The from_ri_sosi only carries correct packets, so error field is not used.
+  constant c_use_error            : boolean := false;
+
+  -- The size for 1 block is probably already enough as the number of blocks received
+  -- on the remote input of the mux probably have enough gap time in between. Just
+  -- to be sure to not run into issues in the future, the fifo size is increased to
+  -- buffer the maximum nof blocks per block period.
+  constant c_mux_fifo_size   : natural  := 2**ceil_log2(g_P_sq * c_block_size_longwords);
+  -- c_fifo_fill_size should be at least 2 * c_block_size_longwords as dp_repack_data
+  -- repacks from 64bit to 32bit. Choosing 3x to have some room.
+  constant c_fifo_fill_size  : natural  := 2**ceil_log2(3 * c_block_size_longwords);
+
+  signal xsel_data_sosi                : t_dp_sosi := c_dp_sosi_rst;
+  signal local_sosi                    : t_dp_sosi := c_dp_sosi_rst;
+
+  signal ring_mux_sosi                 : t_dp_sosi := c_dp_sosi_rst;
+  signal ring_mux_siso                 : t_dp_siso := c_dp_siso_rdy;
+  signal dp_fifo_fill_sosi             : t_dp_sosi := c_dp_sosi_rst;
+  signal dp_fifo_fill_siso             : t_dp_siso := c_dp_siso_rdy;
+  signal rx_sosi                       : t_dp_sosi := c_dp_sosi_rst;
+  signal dispatch_invert_sosi_arr      : t_dp_sosi_arr(0 to g_P_sq - 1) := (others => c_dp_sosi_rst);
+  signal dispatch_sosi_arr             : t_dp_sosi_arr(g_P_sq - 1 downto 0) := (others => c_dp_sosi_rst);
+begin
+  ---------------------------------------------------------------
+  -- Repack 32b to 64b
+  ---------------------------------------------------------------
+  -- repacking xsel re/im to data field.
+  p_wire_xsel_sosi : process(xsel_sosi)
+  begin
+    xsel_data_sosi <= xsel_sosi;
+    xsel_data_sosi.data(                c_sdp_W_crosslet - 1 downto 0)                <= xsel_sosi.re(c_sdp_W_crosslet - 1 downto 0);
+    xsel_data_sosi.data(c_nof_complex * c_sdp_W_crosslet - 1 downto c_sdp_W_crosslet) <= xsel_sosi.im(c_sdp_W_crosslet - 1 downto 0);
+  end process;
+
+  u_dp_repack_data_local : entity dp_lib.dp_repack_data
+  generic map (
+    g_in_dat_w       => c_data_w,
+    g_in_nof_words   => c_longword_w / c_data_w,
+    g_out_dat_w      => c_longword_w,
+    g_out_nof_words  => 1,
+    g_pipeline_ready => true  -- Needed for src_in.ready to snk_out.ready.
+  )
+  port map (
+    rst => dp_rst,
+    clk => dp_clk,
+
+    snk_in  => xsel_data_sosi,
+    src_out => local_sosi
+  );
+
+  ---------------------------------------------------------------
+  -- ring_mux
+  ---------------------------------------------------------------
+  u_ring_mux : entity ring_lib.ring_mux
+  generic map (
+    g_bsn_w        => c_dp_stream_bsn_w,
+    g_data_w       => c_longword_w,
+    g_channel_w    => c_word_w,
+    g_use_error    => c_use_error,
+    g_fifo_size    => array_init(c_mux_fifo_size, 2)
+  )
+  port map (
+    dp_clk => dp_clk,
+    dp_rst => dp_rst,
+
+    remote_sosi => from_ri_sosi,
+    local_sosi  => local_sosi,
+    mux_sosi    => ring_mux_sosi,
+    mux_siso    => ring_mux_siso
+  );
+
+  to_ri_sosi <= ring_mux_sosi;
+
+  -- fill fifo to remove valid gaps that occur due to repack 32b/64b in local_sosi,
+  -- the from_ri_sosi has no valid gaps during block.
+  u_dp_fifo_fill_eop : entity dp_lib.dp_fifo_fill_eop
+  generic map (
+    g_data_w         => c_longword_w,
+    g_bsn_w          => c_dp_stream_bsn_w,
+    g_empty_w        => c_empty_w,
+    g_channel_w      => c_channel_w,
+    g_use_bsn        => true,
+    g_use_empty      => c_use_empty,
+    g_use_channel    => c_use_channel,
+    g_use_error      => c_use_error,
+    g_use_sync       => true,
+    g_fifo_fill      => c_block_size_longwords,
+    g_fifo_size      => c_fifo_fill_size
+  )
+  port map (
+    wr_rst      => dp_rst,
+    wr_clk      => dp_clk,
+    rd_rst      => dp_rst,
+    rd_clk      => dp_clk,
+
+    snk_out     => ring_mux_siso,
+    snk_in      => ring_mux_sosi,
+
+    src_in      => dp_fifo_fill_siso,
+    src_out     => dp_fifo_fill_sosi
+  );
+
+  ---------------------------------------------------------------
+  -- Repack 64b to 32b
+  ---------------------------------------------------------------
+  u_dp_repack_data_rx : entity dp_lib.dp_repack_data
+  generic map (
+    g_in_dat_w       => c_longword_w,
+    g_in_nof_words   => 1,
+    g_out_dat_w      => c_data_w,
+    g_out_nof_words  => c_longword_w / c_data_w,
+    g_pipeline_ready => true  -- Needed for src_in.ready to snk_out.ready.
+  )
+  port map (
+    rst => dp_rst,
+    clk => dp_clk,
+
+    snk_in  => dp_fifo_fill_sosi,
+    snk_out => dp_fifo_fill_siso,
+    src_out => rx_sosi
+  );
+
+  ---------------------------------------------------------------
+  -- dp_demux
+  ---------------------------------------------------------------
+  u_dp_demux : entity dp_lib.dp_demux
+  generic map (
+    g_mode              => 0,
+    g_nof_output        => g_P_sq,
+    g_remove_channel_lo => false,
+    g_sel_ctrl_invert   => true  -- TRUE when indexed (g_nof_input-1 DOWNTO 0)
+  )
+  port map (
+    rst => dp_rst,
+    clk => dp_clk,
+
+    snk_in      => rx_sosi,
+    src_out_arr => dispatch_invert_sosi_arr
+  );
+
+  dispatch_sosi_arr <= func_dp_stream_arr_reverse_range(dispatch_invert_sosi_arr);
+
+  ---------------------------------------------------------------
+  -- dp_bsn_aligner_v2
+  ---------------------------------------------------------------
+  u_mmp_dp_bsn_align_v2 : entity dp_lib.mmp_dp_bsn_align_v2
+  generic map(
+    -- for dp_bsn_align_v2
+    g_nof_streams             => g_P_sq,
+    g_bsn_latency_max         => 2,
+    g_nof_aligners_max        => 1,  -- 1 for Access scheme 3.
+    g_block_size              => c_block_size,
+    g_data_w                  => c_data_w,
+    g_use_mm_output           => true,
+    g_rd_latency              => 1,  -- Required for st_xst
+    -- for mms_dp_bsn_monitor_v2
+    -- Using c_sdp_N_clk_sync_timeout_xsub as g_nof_clk_per_sync is used for BSN monitor timeout.
+    g_nof_clk_per_sync        => c_sdp_N_clk_sync_timeout_xsub,
+    g_nof_input_bsn_monitors  => g_P_sq,
+    g_use_bsn_output_monitor  => true
+    )
+  port map (
+    -- Memory-mapped clock domain
+    mm_rst                  => mm_rst,
+    mm_clk                  => mm_clk,
+
+    reg_bsn_align_copi      => reg_bsn_align_copi,
+    reg_bsn_align_cipo      => reg_bsn_align_cipo,
+
+    reg_input_monitor_copi  => reg_bsn_monitor_v2_bsn_align_input_copi,
+    reg_input_monitor_cipo  => reg_bsn_monitor_v2_bsn_align_input_cipo,
+
+    reg_output_monitor_copi => reg_bsn_monitor_v2_bsn_align_output_copi,
+    reg_output_monitor_cipo => reg_bsn_monitor_v2_bsn_align_output_cipo,
+
+    -- Streaming clock domain
+    dp_rst     => dp_rst,
+    dp_clk     => dp_clk,
+
+    -- Streaming input
+    in_sosi_arr => dispatch_sosi_arr,
+
+    -- Output via local MM interface in dp_clk domain, when g_use_mm_output = TRUE.
+    mm_sosi     => crosslets_sosi,
+    mm_copi     => crosslets_copi,
+    mm_cipo_arr => crosslets_cipo_arr
+  );
+end str;
diff --git a/applications/lofar2/libraries/sdp/src/vhdl/sdp_crosslets_remote_v2.vhd b/applications/lofar2/libraries/sdp/src/vhdl/sdp_crosslets_remote_v2.vhd
new file mode 100644
index 0000000000000000000000000000000000000000..2f283a380342a2fe65510774a4f33dcc95071cb0
--- /dev/null
+++ b/applications/lofar2/libraries/sdp/src/vhdl/sdp_crosslets_remote_v2.vhd
@@ -0,0 +1,273 @@
+-------------------------------------------------------------------------------
+--
+-- Copyright 2021
+-- ASTRON (Netherlands Institute for Radio Astronomy) <http://www.astron.nl/>
+-- P.O.Box 2, 7990 AA Dwingeloo, The Netherlands
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+-------------------------------------------------------------------------------
+
+-------------------------------------------------------------------------------
+--
+-- Author: R. van der Walle, E. Kooistra
+-- Purpose:
+-- . Implements the functionality for remote crosslets IO and aligning the
+--   local and remote crosslets in the node_sdp_correlator of the LOFAR2
+--   SDPFW design.
+-- Description:
+-- . Improvement compared to v1 is that in v2 the local crosslets are passed
+--   on directly to input 0 of the dp_bsn_align_v2, instead of via the
+--   ring_mux and dp_demux. In this way the block period of the reference
+--   input 0 remains constant and therefore also of crosslets_sosi.sop. This
+--   ensures that there is always constant, and thus enough, time to read the
+--   aligned output via crosslets_copi.
+-------------------------------------------------------------------------------
+
+library IEEE, common_lib, dp_lib, reorder_lib, st_lib, mm_lib, ring_lib;
+use IEEE.std_logic_1164.all;
+use common_lib.common_pkg.all;
+use common_lib.common_mem_pkg.all;
+use common_lib.common_network_layers_pkg.all;
+use dp_lib.dp_stream_pkg.all;
+use ring_lib.ring_pkg.all;
+use work.sdp_pkg.all;
+
+entity sdp_crosslets_remote_v2 is
+  generic (
+    g_P_sq             : natural := c_sdp_P_sq  -- number of BSN aligner input streams: input 0 local, others remote
+  );
+  port (
+    dp_clk             : in  std_logic;
+    dp_rst             : in  std_logic;
+
+    xsel_sosi          : in  t_dp_sosi;  -- local crosslets, complex data in the re and im fields
+    from_ri_sosi       : in  t_dp_sosi := c_dp_sosi_rst;  -- remote crosslet packets with 64b data
+    to_ri_sosi         : out t_dp_sosi;  -- ring_mux output of local and remote crosslet packets
+
+    crosslets_sosi     : out t_dp_sosi;  -- mm_sosi output of the BSN aligner
+    crosslets_copi     : in  t_mem_copi := c_mem_copi_rst;  -- MM read access to aligned crosslets, in dp_clk domain
+    crosslets_cipo_arr : out t_mem_cipo_arr(g_P_sq - 1 downto 0);  -- MM read data, one element per aligner stream
+
+    mm_rst             : in  std_logic;
+    mm_clk             : in  std_logic;  -- memory-mapped clock domain of the reg_* ports below
+
+    reg_bsn_align_copi                       : in  t_mem_copi := c_mem_copi_rst;
+    reg_bsn_align_cipo                       : out t_mem_cipo;
+    reg_bsn_monitor_v2_bsn_align_input_copi  : in  t_mem_copi := c_mem_copi_rst;
+    reg_bsn_monitor_v2_bsn_align_input_cipo  : out t_mem_cipo;
+    reg_bsn_monitor_v2_bsn_align_output_copi : in  t_mem_copi := c_mem_copi_rst;
+    reg_bsn_monitor_v2_bsn_align_output_cipo : out t_mem_cipo
+  );
+end sdp_crosslets_remote_v2;
+
+architecture str of sdp_crosslets_remote_v2 is  -- ring Tx via ring_mux, ring Rx via FIFO + repack + demux, then BSN align
+  constant c_block_size           : natural := c_sdp_N_crosslets_max * c_sdp_S_pn;
+  constant c_block_size_longwords : natural := ceil_div(c_block_size, 2);  -- 32b -> 64b
+  constant c_data_w               : natural := c_sdp_W_crosslet * c_nof_complex;
+  -- The channel field carries the index of time multiplexed crosslet packets
+  constant c_use_channel          : boolean := true;
+  constant c_channel_w            : natural := ceil_log2(g_P_sq);
+  -- With 32b data repacked in 64b one empty bit is enough. For crosslets the number
+  -- of 32b words is c_block_size, which is even, so empty will be 0 always. However
+  -- do support odd sizes, to be safe.
+  constant c_use_empty            : boolean := true;
+  constant c_empty_w              : natural := 1;
+  -- The from_ri_sosi only carries correct packets, so error field is not used.
+  constant c_use_error            : boolean := false;
+
+  -- The size for 1 block is probably already enough as the number of blocks received
+  -- on the remote input of the mux probably have enough gap time in between. Just
+  -- to be sure to not run into issues in the future, the fifo size is increased to
+  -- buffer the maximum nof blocks per block period.
+  constant c_mux_fifo_size   : natural  := 2**ceil_log2(g_P_sq * c_block_size_longwords);
+  -- c_repack_fifo_size should be at least c_block_size_longwords / 2, as dp_repack_data
+  -- unpacks by factor 2 from 64bit to 32bit. Choose 1x to have some room.
+  constant c_repack_fifo_size  : natural  := 2**ceil_log2(1 * c_block_size_longwords);
+
+  signal xsel_data_sosi                : t_dp_sosi := c_dp_sosi_rst;
+  signal local_sosi                    : t_dp_sosi := c_dp_sosi_rst;
+  signal ring_mux_sosi                 : t_dp_sosi := c_dp_sosi_rst;
+  signal ring_mux_siso                 : t_dp_siso := c_dp_siso_rdy;
+  signal repack_fifo_sosi              : t_dp_sosi := c_dp_sosi_rst;
+  signal repack_fifo_siso              : t_dp_siso := c_dp_siso_rdy;
+  signal rx_sosi                       : t_dp_sosi := c_dp_sosi_rst;
+  signal dispatch_invert_sosi_arr      : t_dp_sosi_arr(0 to g_P_sq - 1) := (others => c_dp_sosi_rst);
+  signal dispatch_sosi_arr             : t_dp_sosi_arr(g_P_sq - 1 downto 0) := (others => c_dp_sosi_rst);
+  signal to_aligner_sosi_arr           : t_dp_sosi_arr(g_P_sq - 1 downto 0) := (others => c_dp_sosi_rst);
+begin
+  ---------------------------------------------------------------
+  -- Repack 32b to 64b
+  ---------------------------------------------------------------
+  -- Wire xsel re/im into the data field: re in the low c_sdp_W_crosslet bits, im in the bits above.
+  p_wire_xsel_sosi : process(xsel_sosi)
+  begin
+    xsel_data_sosi <= xsel_sosi;
+    xsel_data_sosi.data(                c_sdp_W_crosslet - 1 downto 0)                <= xsel_sosi.re(c_sdp_W_crosslet - 1 downto 0);
+    xsel_data_sosi.data(c_nof_complex * c_sdp_W_crosslet - 1 downto c_sdp_W_crosslet) <= xsel_sosi.im(c_sdp_W_crosslet - 1 downto 0);
+  end process;
+
+  u_dp_repack_data_local : entity dp_lib.dp_repack_data
+  generic map (
+    g_in_dat_w       => c_data_w,
+    g_in_nof_words   => c_longword_w / c_data_w,
+    g_out_dat_w      => c_longword_w,
+    g_out_nof_words  => 1,
+    g_pipeline_ready => true  -- Needed for src_in.ready to snk_out.ready.
+  )
+  port map (
+    rst => dp_rst,
+    clk => dp_clk,
+
+    snk_in  => xsel_data_sosi,
+    src_out => local_sosi
+  );
+
+  ---------------------------------------------------------------
+  -- ring_mux
+  ---------------------------------------------------------------
+  u_ring_mux : entity ring_lib.ring_mux
+  generic map (
+    g_bsn_w        => c_dp_stream_bsn_w,
+    g_data_w       => c_longword_w,
+    g_channel_w    => c_word_w,
+    g_use_error    => c_use_error,
+    g_fifo_size    => array_init(c_mux_fifo_size, 2)
+  )
+  port map (
+    dp_clk => dp_clk,
+    dp_rst => dp_rst,
+
+    remote_sosi => from_ri_sosi,
+    local_sosi  => local_sosi,
+    mux_sosi    => ring_mux_sosi,
+    mux_siso    => ring_mux_siso
+  );
+
+  to_ri_sosi <= ring_mux_sosi;  -- in v2 the ring_mux output is only used for transmit to the ring
+
+  ---------------------------------------------------------------
+  -- Repack 64b to 32b
+  ---------------------------------------------------------------
+  -- FIFO to take backpressure from u_dp_repack_data_rx
+  u_dp_fifo_sc : entity dp_lib.dp_fifo_sc
+  generic map (
+    g_data_w         => c_longword_w,
+    g_bsn_w          => c_dp_stream_bsn_w,
+    g_empty_w        => c_empty_w,
+    g_channel_w      => c_channel_w,
+    g_use_bsn        => true,
+    g_use_empty      => c_use_empty,
+    g_use_channel    => c_use_channel,
+    g_use_error      => c_use_error,
+    g_use_sync       => true,
+    g_fifo_size      => c_repack_fifo_size
+  )
+  port map (
+    rst         => dp_rst,
+    clk         => dp_clk,
+
+    snk_out     => open,  -- no flow control towards from_ri_sosi
+    snk_in      => from_ri_sosi,  -- remote packets only, the local crosslets bypass this receive path
+
+    src_in      => repack_fifo_siso,
+    src_out     => repack_fifo_sosi
+  );
+
+  u_dp_repack_data_rx : entity dp_lib.dp_repack_data
+  generic map (
+    g_in_dat_w       => c_longword_w,
+    g_in_nof_words   => 1,
+    g_out_dat_w      => c_data_w,
+    g_out_nof_words  => c_longword_w / c_data_w,
+    g_pipeline_ready => true  -- Needed for src_in.ready to snk_out.ready.
+  )
+  port map (
+    rst => dp_rst,
+    clk => dp_clk,
+
+    snk_in  => repack_fifo_sosi,
+    snk_out => repack_fifo_siso,
+    src_out => rx_sosi
+  );
+
+  ---------------------------------------------------------------
+  -- dp_demux
+  ---------------------------------------------------------------
+  u_dp_demux : entity dp_lib.dp_demux
+  generic map (
+    g_mode              => 0,
+    g_nof_output        => g_P_sq,
+    g_remove_channel_lo => false,
+    g_sel_ctrl_invert   => true  -- TRUE when indexed (g_nof_input-1 DOWNTO 0)
+  )
+  port map (
+    rst => dp_rst,
+    clk => dp_clk,
+
+    snk_in      => rx_sosi,
+    src_out_arr => dispatch_invert_sosi_arr
+  );
+
+  dispatch_sosi_arr <= func_dp_stream_arr_reverse_range(dispatch_invert_sosi_arr);  -- (0 to N-1) --> (N-1 downto 0)
+
+  -- Group local input stream with the remote input streams
+  to_aligner_sosi_arr(g_P_sq - 1 downto 1) <= dispatch_sosi_arr(g_P_sq - 1 downto 1);  -- dispatch_sosi_arr(0) is not used
+  to_aligner_sosi_arr(0) <= xsel_data_sosi;  -- local crosslets directly, not via ring_mux and dp_demux
+
+  ---------------------------------------------------------------
+  -- dp_bsn_align_v2
+  ---------------------------------------------------------------
+  u_mmp_dp_bsn_align_v2 : entity dp_lib.mmp_dp_bsn_align_v2
+  generic map(
+    -- for dp_bsn_align_v2
+    g_nof_streams             => g_P_sq,
+    g_bsn_latency_max         => 2,
+    g_nof_aligners_max        => 1,  -- 1 for Access scheme 3.
+    g_block_size              => c_block_size,
+    g_data_w                  => c_data_w,
+    g_use_mm_output           => true,
+    g_rd_latency              => 1,  -- Required for st_xst
+    -- for mms_dp_bsn_monitor_v2
+    -- Using c_sdp_N_clk_sync_timeout_xsub as g_nof_clk_per_sync is used for BSN monitor timeout.
+    g_nof_clk_per_sync        => c_sdp_N_clk_sync_timeout_xsub,
+    g_nof_input_bsn_monitors  => g_P_sq,
+    g_use_bsn_output_monitor  => true
+    )
+  port map (
+    -- Memory-mapped clock domain
+    mm_rst                  => mm_rst,
+    mm_clk                  => mm_clk,
+
+    reg_bsn_align_copi      => reg_bsn_align_copi,
+    reg_bsn_align_cipo      => reg_bsn_align_cipo,
+
+    reg_input_monitor_copi  => reg_bsn_monitor_v2_bsn_align_input_copi,
+    reg_input_monitor_cipo  => reg_bsn_monitor_v2_bsn_align_input_cipo,
+
+    reg_output_monitor_copi => reg_bsn_monitor_v2_bsn_align_output_copi,
+    reg_output_monitor_cipo => reg_bsn_monitor_v2_bsn_align_output_cipo,
+
+    -- Streaming clock domain
+    dp_rst     => dp_rst,
+    dp_clk     => dp_clk,
+
+    -- Streaming input
+    in_sosi_arr => to_aligner_sosi_arr,
+
+    -- Output via local MM interface in dp_clk domain, when g_use_mm_output = TRUE.
+    mm_sosi     => crosslets_sosi,
+    mm_copi     => crosslets_copi,
+    mm_cipo_arr => crosslets_cipo_arr
+  );
+end str;
diff --git a/applications/lofar2/libraries/sdp/src/vhdl/sdp_pkg.vhd b/applications/lofar2/libraries/sdp/src/vhdl/sdp_pkg.vhd
index 1e11aca4f8f8450f7c75170cf30f1ea9ba2e27e2..8f6701daf221360c81c7ba470e92bbf4ec5e7042 100644
--- a/applications/lofar2/libraries/sdp/src/vhdl/sdp_pkg.vhd
+++ b/applications/lofar2/libraries/sdp/src/vhdl/sdp_pkg.vhd
@@ -74,12 +74,13 @@ package sdp_pkg is
   constant c_sdp_N_pn_max                  : natural := 16;  -- max 16 PN per ring = per antenna band
   constant c_sdp_N_pol                     : natural := 2;
   constant c_sdp_N_pol_bf                  : natural := 2;
-  constant c_sdp_N_rings_sdp               : natural := 1;
+  constant c_sdp_N_rings_sdp               : natural := 1;  -- number of QSFP rings in SDP, each has N_lane = 8 lanes
   constant c_sdp_N_ring_lanes_max          : natural := 8;  -- = N_lane in doc
   constant c_sdp_N_sub                     : natural := 512;
   constant c_sdp_N_sync_rcu                : natural := 1;
   constant c_sdp_N_taps                    : natural := 16;
-  constant c_sdp_P_sq                      : natural := 9;  -- = N_pn / 2 + 1
+  constant c_sdp_P_sq                      : natural := 9;  -- = N_pn / 2 + 1 square correlator cells for XST
+  constant c_sdp_P_sum                     : natural := 2;  -- sums of two in ring beamformer adder tree
   constant c_sdp_Q_fft                     : natural := 2;
   constant c_sdp_S_pn                      : natural := 12;
   constant c_sdp_S_rcu                     : natural := 3;
@@ -589,8 +590,8 @@ package sdp_pkg is
   constant c_sdp_reg_stat_hdr_dat_bst_addr_w                 : natural := ceil_log2(c_sdp_N_beamsets) + c_sdp_reg_stat_hdr_dat_addr_w;
   constant c_sdp_reg_bsn_monitor_v2_bst_offload_addr_w       : natural := ceil_log2(c_sdp_N_beamsets) + c_sdp_reg_bsn_monitor_v2_addr_w;
   constant c_sdp_reg_bsn_monitor_v2_beamlet_output_addr_w    : natural := ceil_log2(c_sdp_N_beamsets) + c_sdp_reg_bsn_monitor_v2_addr_w;
-  constant c_sdp_reg_bsn_align_v2_bf_addr_w                  : natural := ceil_log2(c_sdp_N_beamsets) + ceil_log2(c_dual) + c_sdp_reg_bsn_align_v2_addr_w;
-  constant c_sdp_reg_bsn_monitor_v2_rx_align_bf_addr_w       : natural := ceil_log2(c_sdp_N_beamsets) + ceil_log2(c_dual) + c_sdp_reg_bsn_monitor_v2_addr_w;
+  constant c_sdp_reg_bsn_align_v2_bf_addr_w                  : natural := ceil_log2(c_sdp_N_beamsets) + ceil_log2(c_sdp_P_sum) + c_sdp_reg_bsn_align_v2_addr_w;
+  constant c_sdp_reg_bsn_monitor_v2_rx_align_bf_addr_w       : natural := ceil_log2(c_sdp_N_beamsets) + ceil_log2(c_sdp_P_sum) + c_sdp_reg_bsn_monitor_v2_addr_w;
   constant c_sdp_reg_bsn_monitor_v2_aligned_bf_addr_w        : natural := ceil_log2(c_sdp_N_beamsets) + c_sdp_reg_bsn_monitor_v2_addr_w;
   constant c_sdp_reg_ring_lane_info_bf_addr_w                : natural := ceil_log2(c_sdp_N_beamsets) + 1;
   constant c_sdp_reg_bsn_monitor_v2_ring_rx_bf_addr_w        : natural := ceil_log2(c_sdp_N_beamsets) + c_sdp_reg_bsn_monitor_v2_addr_w;
diff --git a/applications/lofar2/libraries/sdp/src/vhdl/sdp_station.vhd b/applications/lofar2/libraries/sdp/src/vhdl/sdp_station.vhd
index 727c59d7774ed8e9207a182450e5452e177994d0..dee188c0b1a2d0c1e24e21be34797d944da75c36 100644
--- a/applications/lofar2/libraries/sdp/src/vhdl/sdp_station.vhd
+++ b/applications/lofar2/libraries/sdp/src/vhdl/sdp_station.vhd
@@ -419,8 +419,8 @@ architecture str of sdp_station is
   constant c_addr_w_reg_bdo_destinations           : natural := c_sdp_reg_bdo_destinations_info_w_one;
   constant c_addr_w_reg_dp_xonoff                  : natural := 1;
   constant c_addr_w_ram_st_bst                     : natural := ceil_log2(c_sdp_S_sub_bf * c_sdp_N_pol * (c_longword_sz / c_word_sz));
-  constant c_addr_w_reg_bsn_align_v2_bf            : natural := ceil_log2(c_dual) + c_sdp_reg_bsn_align_v2_addr_w;
-  constant c_addr_w_reg_bsn_monitor_v2_rx_align_bf : natural := ceil_log2(c_dual) + c_sdp_reg_bsn_monitor_v2_addr_w;
+  constant c_addr_w_reg_bsn_align_v2_bf            : natural := ceil_log2(c_sdp_P_sum) + c_sdp_reg_bsn_align_v2_addr_w;
+  constant c_addr_w_reg_bsn_monitor_v2_rx_align_bf : natural := ceil_log2(c_sdp_P_sum) + c_sdp_reg_bsn_monitor_v2_addr_w;
   constant c_addr_w_reg_ring_lane_info_bf          : natural := 1;
 
   -- Read only sdp_info values
diff --git a/applications/lofar2/libraries/sdp/tb/vhdl/tb_sdp_beamformer_remote_ring.vhd b/applications/lofar2/libraries/sdp/tb/vhdl/tb_sdp_beamformer_remote_ring.vhd
index 421f4496540b32e8393235abbc0d4a23388d1b55..5cc1a7bc4af02db5ba94db771d8050b1368fda87 100644
--- a/applications/lofar2/libraries/sdp/tb/vhdl/tb_sdp_beamformer_remote_ring.vhd
+++ b/applications/lofar2/libraries/sdp/tb/vhdl/tb_sdp_beamformer_remote_ring.vhd
@@ -20,50 +20,313 @@
 --
 -- Author: E. Kooistra
 -- Purpose:
--- . Test bench for multiple sdp_beamformer_output.vhd + ring_lane.vhd in a ring
+-- . Test bench for multiple sdp_beamformer_remote.vhd + ring_lane.vhd +
+--   tr_10GbE in a ring
 -- Description:
 -- . https://support.astron.nl/confluence/display/L2M/L5+SDPFW+Design+Document%3A+Beamformer
 --
---   This tb is inspired by tb_lofar2_unb2c_sdp_station_bf_ring.vhd, however
+-- . This tb is inspired by tb_lofar2_unb2c_sdp_station_bf_ring.vhd, however
 --   here the purpose is to simulate the memory usage of the circular buffer
 --   in the bsn_aligner_v2 at each node.
 --
+-- . Block diagram:
+--   * tb can use one instance of tr_10Gbe to model Rx from ring and Tx to ring.
+--   * Ring lane serial links for ring nodes RN = 0 to c_last_rn:
+--
+--     tr_10gbe_ring_serial_tx_arr --> tr_10gbe_ring_serial_rx_arr after c_cable_delay
+--
+--         /<-------------------------------------------------------------\
+--         \---> 0 ---> RN - 1  --->  RN  --->  RN + 1 ---> c_last_rn --->/
+--                                    |^
+--    tr_10gbe_ring_serial_rx_arr(RN) || tr_10gbe_ring_serial_tx_arr(RN)
+--                                    v|
+--                                  tr_10Gbe
+--                                    |^
+--      tr_10gbe_ring_rx_sosi_arr(RN) || tr_10gbe_ring_tx_sosi_arr(RN)
+--                                    v|
+--                                 ring_lane
+--                                    |^
+--               from_ri_sosi_arr(RN) || to_ri_sosi_arr(RN)
+--                                    v|
+--        local_bf_sosi --> sdp_beamformer_remote --> bf_sum_sosi_arr(RN)
+--                                                    bf_sum_sosi
+--   * BSN monitors:
+--                                            RN
+--                                            |^
+--              ring_lane/ring_rx             || ring_lane/ring_tx
+--              FPGA_bf_ring_rx_latency_R(RN) || FPGA_bf_ring_tx_latency_R(RN)
+--                                            ||
+--      dp_bsn_align_v2 P_sum = 2 inputs      ||
+--      FPGA_bf_rx_align_latency_R(RN)(P_sum) ||
+--                                            ||
+--           dp_bsn_align_v2 aligned output   ||
+--              FPGA_bf_aligned_latency_R(RN) v|
+--
+-- . BF latency results from SDP-ARTS HW
+--   - with 16 ring nodes (GN = 64 is RN = 0)
+--   - 2024-03-02T21.16.33_d601da896_lofar2_unb2b_sdp_station_full_wg
+--
+--   Node:  bf_ring_rx    bf_rx_align        bf_aligned      bf_ring_tx
+--          _latency:     _latency:          _latency:       _latency:
+--   64:    -1            ( 1    -1    )     2053            3114
+--   65:    4898          ( 1    3880  )     4101            5162
+--   66:    6949          ( 1    5916  )     6149            7210
+--   67:    8998          ( 1    7960  )     8197            9258
+--   68:    11048         ( 1    10003 )     10245           11306
+--   69:    13093         ( 1    12063 )     12293           13354
+--   70:    15135         ( 1    14105 )     14341           15402
+--   71:    17174         ( 1    16154 )     16389           17450
+--   72:    19261         ( 1    18229 )     18437           19498
+--   73:    21288         ( 1    20261 )     20485           21546
+--   74:    23319         ( 1    22292 )     22533           23594
+--   75:    25367         ( 1    24359 )     24581           25642
+--   76:    27448         ( 1    26417 )     26629           27690
+--   77:    29471         ( 1    28453 )     28677           29738
+--   78:    31512         ( 1    30481 )     30725           31786
+--   79:    33567         ( 1    32537 )     32773           -1
+--
+--   Simulation latency results with this tb
+--
+--   # c_cable_delay = 0 * 6.4 ns (c_bsn_latency_first_node = 2)
+--   #
+--   # Node:  bf_ring_rx   bf_rx_align       bf_aligned     bf_ring_tx
+--   #        _latency:    _latency:         _latency:      _latency:
+--   # 0:     -1           ( 1   0 )         2053           2075
+--   # 1:     3824         ( 1   3837 )      4101           4123
+--   # 2:     5876         ( 1   5889 )      6149           6171
+--   # 3:     7926         ( 1   7939 )      8197           8219
+--   # 4:     9977         ( 1   9990 )      10245          10267
+--   # 5:     12029        ( 1   12042 )     12293          12315
+--   # 6:     14079        ( 1   14092 )     14341          14363
+--   # 7:     16108        ( 1   16121 )     16389          16411
+--   # 8:     18159        ( 1   18172 )     18437          18459
+--   # 9:     20211        ( 1   20224 )     20485          20507
+--   # 10:    22261        ( 1   22274 )     22533          22555
+--   # 11:    24312        ( 1   24325 )     24581          24603
+--   # 12:    26363        ( 1   26376 )     26629          26651
+--   # 13:    28414        ( 1   28427 )     28677          28699
+--   # 14:    30465        ( 1   30478 )     30725          30747
+--   # 15:    32493        ( 1   32506 )     32773          -1
+--
+--   # c_cable_delay = 30 * 6.4 ns (c_bsn_latency_first_node = 2)
+--   #
+--   # Node:  bf_ring_rx   bf_rx_align       bf_aligned     bf_ring_tx
+--   #        _latency:    _latency:         _latency:      _latency:
+--   # 0:     -1           ( 1    0 )        2053           2075
+--   # 1:     3862         ( 1    3875 )     4101           4123
+--   # 2:     5914         ( 1    5927 )     6149           6171
+--   # 3:     7965         ( 1    7978 )     8197           8219
+--   # 4:     10015        ( 1    10028 )    10245          10267
+--   # 5:     12067        ( 1    12080 )    12293          12315
+--   # 6:     14118        ( 1    14131 )    14341          14363
+--   # 7:     16146        ( 1    16159 )    16389          16411
+--   # 8:     18197        ( 1    18210 )    18437          18459
+--   # 9:     20249        ( 1    20262 )    20485          20507
+--   # 10:    22299        ( 1    22312 )    22533          22555
+--   # 11:    24350        ( 1    24363 )    24581          24603
+--   # 12:    26402        ( 1    26415 )    26629          26651
+--   # 13:    28452        ( 1    28465 )    28677          28699
+--   # 14:    30503        ( 1    30516 )    30725          30747
+--   # 15:    32532        ( 1    32545 )    32773          -1
+--
+--   # c_cable_delay = 30 * 6.4 ns (c_bsn_latency_first_node = 1)
+--   #
+--   # Node:  bf_ring_rx   bf_rx_align       bf_aligned      bf_ring_tx
+--   #        _latency:    _latency:         _latency:       _latency:
+--   # 0:     -1           ( 1    0 )        1029            1051
+--   # 1:     2837         ( 1    2850 )     3077            3099
+--   # 2:     4888         ( 1    4901 )     5125            5147
+--   # 3:     6939         ( 1    6952 )     7173            7195
+--   # 4:     8990         ( 1    9003 )     9221            9243
+--   # 5:     11041        ( 1    11054 )    11269           11291
+--   # 6:     13092        ( 1    13105 )    13317           13339
+--   # 7:     15143        ( 1    15156 )    15365           15387
+--   # 8:     17172        ( 1    17185 )    17413           17435
+--   # 9:     19222        ( 1    19235 )    19461           19483
+--   # 10:    21274        ( 1    21287 )    21509           21531
+--   # 11:    23325        ( 1    23338 )    23557           23579
+--   # 12:    25375        ( 1    25388 )    25605           25627
+--   # 13:    27427        ( 1    27440 )    27653           27675
+--   # 14:    29478        ( 1    29491 )    29701           29723
+--   # 15:    31507        ( 1    31520 )    31749           -1
+--
+--   - The dp_bsn_align_v2 BSN latency monitor results agree between sim and HW.
+--   - The bf_aligned_latency is exactly equal in sim and on HW, because the
+--     mmp_dp_bsn_align_v2 uses the ref_sync for the BSN monitor and also to
+--     release its BSN aligned output, so the latency only depends on internal
+--     FW buffering and latency.
+--   . The bf_aligned_latency and bf_ring_tx_latency do not depend on cable
+--     delays and are constant when read again in sim or on HW, because they
+--     only depend on fixed internal FW buffering and latency.
+--   - The ring_lane BSN latency monitor results differ between sim and HW, it
+--     is unclear why:
+--     . the ring_rx and ring_tx BSN latency monitor results are about one
+--       block of 1024 larger on HW.
+--     . on the same HW node, the bf_ring_rx_latency is about one block of 1024
+--       larger than the bf_rx_align_latency, even though they are taken at
+--       nearly the same place in the ring_rx signal path.
+--     . on the same HW node, the bf_ring_tx_latency is about one block of 1024
+--       larger than the bf_aligned_latency, even though they are taken at nearly
+--       the same place in the tx signal path.
+--     . the ring_rx and ring_tx BSN latency monitor results for XST do not
+--       show an offset of one block of 1024.
+--     TODO:
+--     . Assume the ring_lane latencies are one block of 1024 too high, and
+--       assume that the bf_rx_align_latency is correct and reflects the actual
+--       packet latency.
+--     . The ring_rx and ring_tx both use func_ring_nof_hops_to_source_rn() and
+--       hops = sosi.channel to get monitor_sosi, maybe there occurs an offset
+--       there.
+--     . The ring_rx and ring_tx both use dp_demux.vhd, maybe that causes a
+--       shift of one block of 1024 in sosi.sync.
+--
 -- Usage:
--- > as 8
+-- > as 3 or more
+-- > add wave -position insertpoint sim:/tb_sdp_beamformer_remote_ring/bf_sum_sosi_arr
 -- > run -a
 -------------------------------------------------------------------------------
 
-library IEEE, common_lib, dp_lib, reorder_lib;
+library IEEE, common_lib, dp_lib, ring_lib, tr_10GbE_lib, tech_pll_lib;
 use IEEE.std_logic_1164.all;
 use common_lib.common_pkg.all;
 use common_lib.common_mem_pkg.all;
 use common_lib.tb_common_pkg.all;
 use common_lib.tb_common_mem_pkg.all;
+use common_lib.common_str_pkg.all;
 use dp_lib.dp_stream_pkg.all;
+use ring_lib.ring_pkg.all;
+use tech_pll_lib.tech_pll_component_pkg.all;
 use work.sdp_pkg.all;
 use work.tb_sdp_pkg.all;
 
 entity tb_sdp_beamformer_remote_ring is
   generic (
-    g_nof_rn             : natural := 16  -- number of nodes in the ring
+    g_nof_rn    : natural := 4;  -- number of nodes in the ring
+    g_nof_sync  : natural := 2  -- number of sync intervals of stimuli, scales g_nof_repeat and c_tb_timeout
   );
 end tb_sdp_beamformer_remote_ring;
 
 architecture tb of tb_sdp_beamformer_remote_ring is
   constant c_dp_clk_period : time := 5 ns;  -- 200 MHz
   constant c_mm_clk_period : time := 1 ns;  -- fast MM clk to speed up simulation
+  constant c_sa_clk_period : time := tech_pll_clk_644_period;  -- 644MHz
+
+  -- Apply cable delay in tech_pll_clk_156_period units, to remain aligned with tr_10GbE sim model
+  -- . Choose c_cable_delay = 30 * 6.4 ~= 192 ns ~= 38 dp_clk of 5 ns, to match delay seen on HW
+  -- . Maximum c_cable_delay <= 186 * 6.4 = 1210 ns ~= 242 dp_clk of 5 ns in simulation with
+  --   g_nof_rn = 16. For larger c_cable_delay the bf_sum_sosi.data goes wrong. The maximum
+  --   c_cable_delay depends a little bit on g_nof_rn, for g_nof_rn = 2 the data goes wrong when
+  --   c_cable_delay >= 190.
+  constant c_clk_156_period  : time := tech_pll_clk_156_period;  -- 6.400020 ns ~= 156.25 MHz
+  constant c_nof_delay       : natural := 30; --286;
+  constant c_cable_delay     : time := c_clk_156_period * c_nof_delay;
+
+  -- BF data
+  constant c_block_period              : natural := c_sdp_N_fft;
+  constant c_block_size                : natural := c_sdp_S_sub_bf * c_sdp_N_pol_bf;
+  constant c_gap_size                  : natural := c_block_period - c_block_size;
+  -- choose sync interval somewhat longer than maximum BF ring latency
+  constant c_nof_blocks_per_sync       : natural := largest(10, (g_nof_rn + 1) * 2);
+  constant c_local_bf_re               : integer := 1;
+  constant c_local_bf_im               : integer := 2;
+
+  -- Ring lane packets
+  constant c_last_rn                   : natural := g_nof_rn - 1;  -- first ring node has index RN = 0 by definition.
+  constant c_use_cable                 : std_logic := '1';  -- '0' ring via PCB traces, '1' ring via QSFP cables
+  constant c_lane_payload_nof_longwords_bf  : natural := (c_block_size * 9) / 16;  -- beamlet block size repacked
+                                              -- from 36b to 64b (9/16 = 36/64), 488 * 2 * 9 / 16 = 549 longwords
+  constant c_lane_packet_nof_longwords_max  : natural := c_lane_payload_nof_longwords_bf + c_ring_dp_hdr_field_size;
+                                              -- = 549 + 3 = 552
+  constant c_fifo_tx_fill_margin       : natural := 10;  -- >= c_fifo_fill_margin = 6 that is used in dp_fifo_fill_eop
+  constant c_fifo_tx_size_ring : natural := true_log_pow2(c_lane_packet_nof_longwords_max + c_fifo_tx_fill_margin);
+                                            -- = 552 + 10 = 562 --> 1024
+  constant c_fifo_tx_fill_ring : natural := c_fifo_tx_size_ring - c_fifo_tx_fill_margin;
+                                            -- = maximum fill level, so rely on eop
+  constant c_err_bi                    : natural := 0;
+  constant c_nof_err_counts            : natural := 8;
+  constant c_bsn_at_sync_check_channel : natural := 1;
+  constant c_validate_channel          : boolean := true;
+  constant c_validate_channel_mode     : string  := "=";
+  constant c_sync_timeout              : natural := c_block_period * (c_nof_blocks_per_sync + 1);
+
+  -- Timeout tb if there is no output bf_sum_sosi
+  constant c_tb_timeout                : time := (g_nof_sync + 1) * c_sync_timeout * c_dp_clk_period;
+
+  -- Address widths of a single MM instance
+  constant c_addr_w_reg_ring_lane_info_bf          : natural := 1;
+
+  signal mm_init                : std_logic := '1';
+  signal tb_end                 : std_logic := '0';
+  signal dp_clk                 : std_logic := '1';
+  signal dp_rst                 : std_logic;
+  signal mm_clk                 : std_logic := '1';
+  signal mm_rst                 : std_logic;
+  signal SA_CLK                 : std_logic := '1';
+  signal tr_ref_clk_312         : std_logic := '0';
+  signal tr_ref_clk_156         : std_logic := '0';
+  signal tr_ref_rst_156         : std_logic := '0';
 
-  constant c_last_rn             : natural := g_nof_rn - 1;  -- first ring node has index RN = 0 by definition.
+  signal stimuli_rst            : std_logic;
+  signal stimuli_end            : std_logic;
 
-  signal mm_init      : std_logic := '1';
-  signal tb_end       : std_logic := '0';
-  signal dp_clk       : std_logic := '1';
-  signal dp_rst       : std_logic;
-  signal mm_clk       : std_logic := '1';
-  signal mm_rst       : std_logic;
+  signal stimuli_sosi           : t_dp_sosi;
+  signal local_bf_sosi          : t_dp_sosi;
+  signal bf_bs_sosi             : t_dp_sosi;
+  signal from_ri_sosi_arr       : t_dp_sosi_arr(c_last_rn downto 0);
+  signal to_ri_sosi_arr         : t_dp_sosi_arr(c_last_rn downto 0);
+  signal bf_sum_sosi_arr        : t_dp_sosi_arr(c_last_rn downto 0);
+  signal bf_sum_sosi            : t_dp_sosi;
 
-  signal rn_index     : natural range 0 to c_sdp_N_pn_max - 1 := 0;
+  -- 10GbE ring
+  signal tr_10gbe_ring_rx_sosi_arr    : t_dp_sosi_arr(c_last_rn downto 0) := (others => c_dp_sosi_rst);
+  signal tr_10gbe_ring_tx_sosi_arr    : t_dp_sosi_arr(c_last_rn downto 0) := (others => c_dp_sosi_rst);
+  signal tr_10gbe_ring_serial_rx_arr  : std_logic_vector(c_last_rn downto 0) := (others => '0');
+  signal tr_10gbe_ring_serial_tx_arr  : std_logic_vector(c_last_rn downto 0) := (others => '0');
 
+  -- BF ring MM registers
+  signal reg_ring_lane_info_bf_copi_arr         : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_ring_lane_info_bf_cipo_arr         : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_ring_lane_info_bf_copi             : t_mem_copi := c_mem_copi_rst;
+  signal reg_ring_lane_info_bf_cipo             : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_bsn_monitor_v2_ring_rx_bf_copi_arr : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_monitor_v2_ring_rx_bf_cipo_arr : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_monitor_v2_ring_rx_bf_copi     : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_monitor_v2_ring_rx_bf_cipo     : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_bsn_monitor_v2_ring_tx_bf_copi_arr : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_monitor_v2_ring_tx_bf_cipo_arr : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_monitor_v2_ring_tx_bf_copi     : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_monitor_v2_ring_tx_bf_cipo     : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_dp_block_validate_err_bf_copi_arr  : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_dp_block_validate_err_bf_cipo_arr  : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_dp_block_validate_err_bf_copi      : t_mem_copi := c_mem_copi_rst;
+  signal reg_dp_block_validate_err_bf_cipo      : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_dp_block_validate_bsn_at_sync_bf_copi_arr : t_mem_copi_arr(c_last_rn downto 0) :=
+                                                         (others => c_mem_copi_rst);
+  signal reg_dp_block_validate_bsn_at_sync_bf_cipo_arr : t_mem_cipo_arr(c_last_rn downto 0) :=
+                                                         (others => c_mem_cipo_rst);
+  signal reg_dp_block_validate_bsn_at_sync_bf_copi     : t_mem_copi := c_mem_copi_rst;
+  signal reg_dp_block_validate_bsn_at_sync_bf_cipo     : t_mem_cipo := c_mem_cipo_rst;
+  -- BF ring MM points
+  signal FPGA_bf_ring_nof_transport_hops_R       : t_natural_arr(c_last_rn downto 0);
+  signal FPGA_bf_ring_rx_latency_R               : t_integer_arr(c_last_rn downto 0);
+  signal FPGA_bf_ring_tx_latency_R               : t_integer_arr(c_last_rn downto 0);
+
+  -- BSN aligner MM registers
+  signal reg_bsn_align_v2_bf_copi_arr            : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_align_v2_bf_cipo_arr            : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_align_v2_bf_copi                : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_align_v2_bf_cipo                : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_bsn_monitor_v2_bf_rx_align_copi_arr : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_monitor_v2_bf_rx_align_cipo_arr : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_monitor_v2_bf_rx_align_copi     : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_monitor_v2_bf_rx_align_cipo     : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_bsn_monitor_v2_bf_aligned_copi_arr  : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_monitor_v2_bf_aligned_cipo_arr  : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_monitor_v2_bf_aligned_copi      : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_monitor_v2_bf_aligned_cipo      : t_mem_cipo := c_mem_cipo_rst;
+  -- BSN aligner Monitor Points
+  signal FPGA_bf_rx_align_latency_R               : t_integer_2arr_2(c_last_rn downto 0);  -- c_sdp_P_sum = 2
+  signal FPGA_bf_aligned_latency_R                : t_integer_arr(c_last_rn downto 0);
 begin
   dp_rst <= '1', '0' after c_dp_clk_period * 7;
   dp_clk <= (not dp_clk) or tb_end after c_dp_clk_period / 2;
@@ -71,29 +334,208 @@ begin
   mm_rst <= '1', '0' after c_mm_clk_period * 7;
   mm_clk <= (not mm_clk) or tb_end after c_mm_clk_period / 2;
 
+  -- Hold the stimuli in reset for 15 us, to wait for tr_10GbE to be active
+  stimuli_rst <= '1', '0' after 15 us;
+
+  SA_CLK <= not SA_CLK after c_sa_clk_period / 2;  -- Serial Gigabit IO sa clock (644 MHz), free running so not gated by tb_end
+
+  -- Generate local BF block stream, released by stimuli_rst, use same for all nodes
+  u_stimuli : entity dp_lib.dp_stream_stimuli
+  generic map (
+    g_sync_period => c_nof_blocks_per_sync,
+    g_nof_repeat  => c_nof_blocks_per_sync * g_nof_sync,  -- = g_nof_sync times the sync period
+    g_pkt_len     => c_block_size,
+    g_pkt_gap     => c_gap_size
+  )
+  port map (
+    rst               => stimuli_rst,
+    clk               => dp_clk,
+    -- Generate stimuli
+    src_out           => stimuli_sosi,  -- data, re, im, channel and err are overruled in p_local_bf_sosi
+    -- End of stimuli
+    tb_end            => stimuli_end  -- p_mm waits for this before it prints the latency results
+  );
+
+  -- Use constant beamlet data to ease verification of (intermediate) beamlet sums at each node
+  p_local_bf_sosi : process(stimuli_sosi)
+  begin
+    local_bf_sosi <= stimuli_sosi;  -- default pass on all fields of the stimuli
+    local_bf_sosi.data <= TO_DP_SDATA(0);  -- this and the next field assignments overrule the default
+    local_bf_sosi.re <= TO_DP_DSP_DATA(c_local_bf_re);  -- constant re, expected sum is checked in p_verify_bf_sum
+    local_bf_sosi.im <= TO_DP_DSP_DATA(c_local_bf_im);  -- constant im, expected sum is checked in p_verify_bf_sum
+    local_bf_sosi.channel <= TO_DP_CHANNEL(0);
+    local_bf_sosi.err <= TO_DP_ERROR(0);
+  end process;
+
+  bf_bs_sosi <= local_bf_sosi;  -- same bs_sosi reference for u_ring_lane_bf at all nodes
+  bf_sum_sosi <= bf_sum_sosi_arr(c_last_rn);  -- sum at the last node, p_mm waits for its sync
+
   p_mm : process
-    variable v_offset : natural;
+    variable v_span               : natural;  -- MM address span of one register instance
+    variable v_span_node          : natural;  -- MM address span of all register instances of one node
+    variable v_offset             : natural;  -- MM word address that is accessed
+    variable v_transport_nof_hops : natural;  -- value written per node, 1 or 0
   begin
     proc_common_wait_until_low(dp_clk, mm_rst);
     proc_common_wait_some_cycles(mm_clk, 10);
 
-    proc_common_wait_cross_clock_domain_latency(c_mm_clk_period, c_dp_clk_period, c_common_cross_clock_domain_latency * 2);
+    proc_common_wait_cross_clock_domain_latency(c_mm_clk_period, c_dp_clk_period,
+                                                c_common_cross_clock_domain_latency * 2);
+    ---------------------------------------------------------------------------
+    -- Setup transport nof hops to 1 for RN = 0:c_last_rn-1 and to 0 for RN = c_last_rn (e.g. RN = 0:15 --> [1, ..., 1, 0])
+    ---------------------------------------------------------------------------
+    -- Write FPGA_bf_ring_nof_transport_hops_RW = ring_lane_info.transport_nof_hops
+    v_span := 2**c_addr_w_reg_ring_lane_info_bf;
+    for RN in 0 to c_last_rn LOOP
+      v_offset := 1 + RN * v_span;  -- word 1 in the ring_lane_info span of node RN
+      v_transport_nof_hops := 1;
+      if RN = c_last_rn then
+        v_transport_nof_hops := 0;  -- last node gets 0 hops
+      end if;
+      proc_mem_mm_bus_wr(v_offset, v_transport_nof_hops, mm_clk,
+                         reg_ring_lane_info_bf_cipo, reg_ring_lane_info_bf_copi);
+    end loop;
+    proc_common_wait_cross_clock_domain_latency(c_mm_clk_period, c_dp_clk_period,
+                                                c_common_cross_clock_domain_latency * 2);
+    -- Readback FPGA_bf_ring_nof_transport_hops_R
+    for RN in 0 to c_last_rn LOOP
+      v_offset := 1 + RN * v_span;  -- same word as was written above
+      proc_mem_mm_bus_rd(v_offset, mm_clk, reg_ring_lane_info_bf_cipo, reg_ring_lane_info_bf_copi);
+      proc_mem_mm_bus_rd_latency(1, mm_clk);  -- read latency of 1 before rddata is used
+      FPGA_bf_ring_nof_transport_hops_R(RN) <= TO_UINT(reg_ring_lane_info_bf_cipo.rddata(c_word_w - 1 downto 0));
+    end loop;
+
+    ---------------------------------------------------------------------------
+    -- Wait until second bf_sum_sosi.sync
+    ---------------------------------------------------------------------------
+    proc_common_wait_until_hi_lo(dp_clk, bf_sum_sosi.sync);
+    proc_common_wait_until_hi_lo(dp_clk, bf_sum_sosi.sync);
+
+    ---------------------------------------------------------------------------
+    -- Read BSN monitors, the latency is read from word 6 of each BSN monitor span
+    ---------------------------------------------------------------------------
+    v_span := 2**c_sdp_reg_bsn_monitor_v2_addr_w;
+    -- Read FPGA_bf_ring_rx_latency_R
+    for RN in 0 to c_last_rn LOOP
+      v_offset := 6 + RN * v_span;
+      proc_mem_mm_bus_rd(v_offset, mm_clk, reg_bsn_monitor_v2_ring_rx_bf_cipo, reg_bsn_monitor_v2_ring_rx_bf_copi);
+      proc_mem_mm_bus_rd_latency(1, mm_clk);
+      FPGA_bf_ring_rx_latency_R(RN) <= TO_SINT(reg_bsn_monitor_v2_ring_rx_bf_cipo.rddata(c_word_w - 1 downto 0));
+    end loop;
+    -- Read FPGA_bf_rx_align_latency_R, for both c_sdp_P_sum = 2 inputs per RN
+    v_span_node := true_log_pow2(c_sdp_P_sum) * v_span;  -- must match g_mult_addr_w of u_mem_mux_reg_bsn_monitor_v2_bf_rx_align
+    for RN in 0 to c_last_rn LOOP
+      for P in 0 to c_sdp_P_sum - 1 loop
+        v_offset := 6 + RN * v_span_node + P * v_span;  -- monitor P within the span of node RN
+        proc_mem_mm_bus_rd(v_offset, mm_clk, reg_bsn_monitor_v2_bf_rx_align_cipo, reg_bsn_monitor_v2_bf_rx_align_copi);
+        proc_mem_mm_bus_rd_latency(1, mm_clk);
+        FPGA_bf_rx_align_latency_R(RN)(P) <= TO_SINT(reg_bsn_monitor_v2_bf_rx_align_cipo.rddata(c_word_w - 1 downto 0));
+      end loop;
+    end loop;
+    -- Read FPGA_bf_aligned_latency_R
+    for RN in 0 to c_last_rn LOOP
+      v_offset := 6 + RN * v_span;
+      proc_mem_mm_bus_rd(v_offset, mm_clk, reg_bsn_monitor_v2_bf_aligned_cipo, reg_bsn_monitor_v2_bf_aligned_copi);
+      proc_mem_mm_bus_rd_latency(1, mm_clk);
+      FPGA_bf_aligned_latency_R(RN) <= TO_SINT(reg_bsn_monitor_v2_bf_aligned_cipo.rddata(c_word_w - 1 downto 0));
+    end loop;
+    -- Read FPGA_bf_ring_tx_latency_R
+    for RN in 0 to c_last_rn LOOP
+      v_offset := 6 + RN * v_span;
+      proc_mem_mm_bus_rd(v_offset, mm_clk, reg_bsn_monitor_v2_ring_tx_bf_cipo, reg_bsn_monitor_v2_ring_tx_bf_copi);
+      proc_mem_mm_bus_rd_latency(1, mm_clk);
+      FPGA_bf_ring_tx_latency_R(RN) <= TO_SINT(reg_bsn_monitor_v2_ring_tx_bf_cipo.rddata(c_word_w - 1 downto 0));
+    end loop;
+
+    ---------------------------------------------------------------------------
+    -- Wait until end of simulation
+    ---------------------------------------------------------------------------
     mm_init <= '0';
+
+    proc_common_wait_until_high(dp_clk, stimuli_end);
+    proc_common_wait_some_cycles(dp_clk, 1000);  -- some extra time after the end of the stimuli
+
+    ---------------------------------------------------------------------------
+    -- Print latency results, one row per node with the values that were read above
+    ---------------------------------------------------------------------------
+    print_str("c_cable_delay = " & int_to_str(c_nof_delay) & " * 6.4 ns");
+    print_str("");
+    print_str("Node:  bf_ring_rx    bf_rx_align          bf_aligned      bf_ring_tx");
+    print_str("       _latency:     _latency:            _latency:       _latency:");
+    for RN in 0 to c_last_rn loop
+       print_str(int_to_str(RN) & ":     " &
+                 int_to_str(FPGA_bf_ring_rx_latency_R(RN)) & "          ( " &
+                 int_to_str(FPGA_bf_rx_align_latency_R(RN)(0)) & "    " &
+                 int_to_str(FPGA_bf_rx_align_latency_R(RN)(1)) & " )        " &
+                 int_to_str(FPGA_bf_aligned_latency_R(RN)) & "            " &
+                 int_to_str(FPGA_bf_ring_tx_latency_R(RN)));
+    end Loop;
+
+    tb_end <= '1';  -- also forces dp_clk and mm_clk high, see the clock assignments
     wait;
   end process;
 
+  -- End the tb simulation
+  proc_common_timeout_failure(c_tb_timeout, tb_end);  -- ERROR: end simulation if it fails to end in time
+  proc_common_stop_simulation(tb_end);  -- OK: end simulation
 
   ------------------------------------------------------------------------------
   -- DUT
   ------------------------------------------------------------------------------
   gen_dut : for RN in 0 to c_last_rn generate
-    -- Ring connections between nodes 0:c_last_rn,0
+    -- Connect ring wires between the nodes, transport delay passes on pulses that are shorter than c_cable_delay
+    wire_ring : if RN > 0 generate
+      tr_10gbe_ring_serial_rx_arr(RN) <= transport tr_10gbe_ring_serial_tx_arr(RN - 1) after c_cable_delay;  -- from previous node
+    end generate;
+    close_ring : if RN = 0 generate
+      tr_10gbe_ring_serial_rx_arr(0) <= transport tr_10gbe_ring_serial_tx_arr(c_last_rn) after c_cable_delay;  -- from last node
+    end generate;
+
+    -- tr_10GbE access at each node, all via front_io QSFP[0]
+    u_tr_10GbE_ring: entity tr_10GbE_lib.tr_10GbE
+    generic map (
+      g_sim           => true,
+      g_sim_level     => 1,
+      g_nof_macs      => 1,  -- the _arr ports below are slices of one element (RN downto RN)
+      g_direction     => "TX_RX",
+      g_tx_fifo_fill  => c_fifo_tx_fill_ring,
+      g_tx_fifo_size  => c_fifo_tx_size_ring
+    )
+    port map (
+      -- Transceiver PLL reference clock
+      tr_ref_clk_644        => SA_CLK,
+      tr_ref_clk_312        => tr_ref_clk_312,  -- from u_tech_pll_xgmii_mac_clocks
+      tr_ref_clk_156        => tr_ref_clk_156,  -- from u_tech_pll_xgmii_mac_clocks
+      tr_ref_rst_156        => tr_ref_rst_156,  -- from u_tech_pll_xgmii_mac_clocks
+
+      -- MM interface
+      mm_rst                => mm_rst,
+      mm_clk                => mm_clk,
+
+      reg_mac_mosi          => c_mem_copi_rst,  -- MM access is not used in this tb
+      reg_mac_miso          => open,
+      reg_eth10g_mosi       => c_mem_copi_rst,  -- MM access is not used in this tb
+      reg_eth10g_miso       => open,
+
+      -- DP interface
+      dp_rst                => dp_rst,
+      dp_clk                => dp_clk,
+
+      src_out_arr           => tr_10gbe_ring_rx_sosi_arr(RN downto RN),  -- to lane_rx_cable_sosi of u_ring_lane_bf
+      snk_in_arr            => tr_10gbe_ring_tx_sosi_arr(RN downto RN),  -- from lane_tx_cable_sosi of u_ring_lane_bf
+
+      -- Serial IO
+      serial_tx_arr         => tr_10gbe_ring_serial_tx_arr(RN downto RN),  -- to serial rx of the next node
+      serial_rx_arr         => tr_10gbe_ring_serial_rx_arr(RN downto RN)  -- from serial tx of the previous node
+    );
+
+    -- Ring lane access at each node
     u_ring_lane_bf : entity ring_lib.ring_lane
       generic map (
         g_lane_direction            => 1,  -- transport in positive RN direction.
         g_lane_data_w               => c_longword_w,
         g_lane_packet_length        => c_lane_payload_nof_longwords_bf,
-        g_lane_total_nof_packets_w  => c_lane_total_nof_packets_w,
+        g_lane_total_nof_packets_w  => 32,
         g_use_dp_layer              => true,
         g_nof_rx_monitors           => 1,
         g_nof_tx_monitors           => 1,
@@ -110,56 +552,159 @@ begin
         dp_clk => dp_clk,
         dp_rst => dp_rst,
 
-        from_lane_sosi     => bf_from_ri_sosi_arr(beamset_id),
-        to_lane_sosi       => bf_to_ri_sosi_arr(beamset_id),
-        lane_rx_cable_sosi => lane_rx_cable_sosi_arr(1 + beamset_id),
-        lane_rx_board_sosi => lane_rx_board_sosi_arr(1 + beamset_id),
-        lane_tx_cable_sosi => lane_tx_cable_sosi_arr(1 + beamset_id),
-        lane_tx_board_sosi => lane_tx_board_sosi_arr(1 + beamset_id),
+        from_lane_sosi     => from_ri_sosi_arr(RN),
+        to_lane_sosi       => to_ri_sosi_arr(RN),
+        lane_rx_cable_sosi => tr_10gbe_ring_rx_sosi_arr(RN),
+        lane_rx_board_sosi => c_dp_sosi_rst,
+        lane_tx_cable_sosi => tr_10gbe_ring_tx_sosi_arr(RN),
+        lane_tx_board_sosi => open,
         bs_sosi            => bf_bs_sosi,  -- used for bsn and sync
 
-        reg_ring_lane_info_copi                => reg_ring_lane_info_bf_copi_arr(beamset_id),
-        reg_ring_lane_info_cipo                => reg_ring_lane_info_bf_cipo_arr(beamset_id),
-        reg_bsn_monitor_v2_ring_rx_copi        => reg_bsn_monitor_v2_ring_rx_bf_copi_arr(beamset_id),
-        reg_bsn_monitor_v2_ring_rx_cipo        => reg_bsn_monitor_v2_ring_rx_bf_cipo_arr(beamset_id),
-        reg_bsn_monitor_v2_ring_tx_copi        => reg_bsn_monitor_v2_ring_tx_bf_copi_arr(beamset_id),
-        reg_bsn_monitor_v2_ring_tx_cipo        => reg_bsn_monitor_v2_ring_tx_bf_cipo_arr(beamset_id),
-        reg_dp_block_validate_err_copi         => reg_dp_block_validate_err_bf_copi_arr(beamset_id),
-        reg_dp_block_validate_err_cipo         => reg_dp_block_validate_err_bf_cipo_arr(beamset_id),
-        reg_dp_block_validate_bsn_at_sync_copi => reg_dp_block_validate_bsn_at_sync_bf_copi_arr(beamset_id),
-        reg_dp_block_validate_bsn_at_sync_cipo => reg_dp_block_validate_bsn_at_sync_bf_cipo_arr(beamset_id),
-
-        this_rn   => this_rn,
-        N_rn      => ring_info.N_rn,
-        rx_select => ring_info.use_cable_to_previous_rn,
-        tx_select => ring_info.use_cable_to_next_rn
+        reg_ring_lane_info_copi                => reg_ring_lane_info_bf_copi_arr(RN),
+        reg_ring_lane_info_cipo                => reg_ring_lane_info_bf_cipo_arr(RN),
+        reg_bsn_monitor_v2_ring_rx_copi        => reg_bsn_monitor_v2_ring_rx_bf_copi_arr(RN),
+        reg_bsn_monitor_v2_ring_rx_cipo        => reg_bsn_monitor_v2_ring_rx_bf_cipo_arr(RN),
+        reg_bsn_monitor_v2_ring_tx_copi        => reg_bsn_monitor_v2_ring_tx_bf_copi_arr(RN),
+        reg_bsn_monitor_v2_ring_tx_cipo        => reg_bsn_monitor_v2_ring_tx_bf_cipo_arr(RN),
+        reg_dp_block_validate_err_copi         => reg_dp_block_validate_err_bf_copi_arr(RN),
+        reg_dp_block_validate_err_cipo         => reg_dp_block_validate_err_bf_cipo_arr(RN),
+        reg_dp_block_validate_bsn_at_sync_copi => reg_dp_block_validate_bsn_at_sync_bf_copi_arr(RN),
+        reg_dp_block_validate_bsn_at_sync_cipo => reg_dp_block_validate_bsn_at_sync_bf_cipo_arr(RN),
+
+        this_rn   => to_uvec(RN, c_byte_w),
+        N_rn      => to_uvec(g_nof_rn, c_byte_w),
+        rx_select => c_use_cable,
+        tx_select => c_use_cable
       );
 
     -- Intermediate BF alignment and summation at each node
     u_sdp_beamformer_remote : entity work.sdp_beamformer_remote
+      generic map (
+        g_nof_aligners_max  => g_nof_rn  -- NOTE(review): presumably the maximum number of ring nodes to align for - confirm
+      )
       port map (
         dp_clk        => dp_clk,
         dp_rst        => dp_rst,
 
-        rn_index      => rn_index,
+        rn_index      => RN,  -- generate index is the ring node index
 
-        local_bf_sosi : in  t_dp_sosi;
-        from_ri_sosi  : in  t_dp_sosi;
-        to_ri_sosi    : out t_dp_sosi;
-        bf_sum_sosi   : out t_dp_sosi;
+        local_bf_sosi => local_bf_sosi,  -- all nodes use same local reference data
+        from_ri_sosi  => from_ri_sosi_arr(RN),  -- from_lane_sosi of u_ring_lane_bf at this node
+        to_ri_sosi    => to_ri_sosi_arr(RN),  -- to_lane_sosi of u_ring_lane_bf at this node
+        bf_sum_sosi   => bf_sum_sosi_arr(RN),  -- checked in p_verify_bf_sum
 
-        mm_rst        : in  std_logic;
-        mm_clk        : in  std_logic;
+        mm_rst        => mm_rst,
+        mm_clk        => mm_clk,
 
-        reg_bsn_align_copi : in  t_mem_copi := c_mem_copi_rst;
-        reg_bsn_align_cipo : out t_mem_cipo;
+        reg_bsn_align_copi                       => reg_bsn_align_v2_bf_copi_arr(RN),
+        reg_bsn_align_cipo                       => reg_bsn_align_v2_bf_cipo_arr(RN),
+        reg_bsn_monitor_v2_bsn_align_input_copi  => reg_bsn_monitor_v2_bf_rx_align_copi_arr(RN),  -- p_mm reads FPGA_bf_rx_align_latency_R
+        reg_bsn_monitor_v2_bsn_align_input_cipo  => reg_bsn_monitor_v2_bf_rx_align_cipo_arr(RN),
+        reg_bsn_monitor_v2_bsn_align_output_copi => reg_bsn_monitor_v2_bf_aligned_copi_arr(RN),  -- p_mm reads FPGA_bf_aligned_latency_R
+        reg_bsn_monitor_v2_bsn_align_output_cipo => reg_bsn_monitor_v2_bf_aligned_cipo_arr(RN)
+      );
+  end generate;  -- gen_dut
 
-        reg_bsn_monitor_v2_bsn_align_input_copi  : in  t_mem_copi := c_mem_copi_rst;
-        reg_bsn_monitor_v2_bsn_align_input_cipo  : out t_mem_cipo;
+  ------------------------------------------------------------------------------
+  -- Verify bf_sum_sosi_arr at every node, to check that no packets were lost: node RN must sum RN + 1 local BF values
+  ------------------------------------------------------------------------------
+  p_verify_bf_sum : process(dp_clk)
+  begin
+    for RN in 0 to c_last_rn loop
+      if rising_edge(dp_clk) and bf_sum_sosi_arr(RN).valid = '1' then  -- once per valid sample, not at both dp_clk edges
+        assert TO_SINT(bf_sum_sosi_arr(RN).re) = (RN + 1) * c_local_bf_re report "Wrong BF re sum at node " & int_to_str(RN) severity error;
+        assert TO_SINT(bf_sum_sosi_arr(RN).im) = (RN + 1) * c_local_bf_im report "Wrong BF im sum at node " & int_to_str(RN) severity error;
+      end if;
+    end loop;
+  end process;
 
-        reg_bsn_monitor_v2_bsn_align_output_copi : in  t_mem_copi := c_mem_copi_rst;
-        reg_bsn_monitor_v2_bsn_align_output_cipo : out t_mem_cipo
-      );
-end generate;  -- gen_dut
+  ------------------------------------------------------------------------------
+  -- 10GbE clocks
+  ------------------------------------------------------------------------------
+  u_tech_pll_xgmii_mac_clocks : entity tech_pll_lib.tech_pll_xgmii_mac_clocks
+  port map (
+    refclk_644 => SA_CLK,
+    rst_in     => mm_rst,
+    clk_156    => tr_ref_clk_156,  -- used by u_tr_10GbE_ring at every node
+    clk_312    => tr_ref_clk_312,  -- used by u_tr_10GbE_ring at every node
+    rst_156    => tr_ref_rst_156,  -- used by u_tr_10GbE_ring at every node
+    rst_312    => open
+  );
+
+  ------------------------------------------------------------------------------
+  -- MM bus multiplexers
+  ------------------------------------------------------------------------------
+  -- Use common_mem_mux to avoid (vcom-1450) Actual (indexed name) for formal "mm_miso" is not a static signal name.
+  -- Use downto range for _arr, to match downto range of mosi_arr.
+  u_mem_mux_reg_ring_lane_info_bf : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_addr_w_reg_ring_lane_info_bf  -- p_mm steps 2**c_addr_w_reg_ring_lane_info_bf addresses per RN
+  )
+  port map (
+    mosi     => reg_ring_lane_info_bf_copi,  -- written and read by p_mm
+    miso     => reg_ring_lane_info_bf_cipo,
+    mosi_arr => reg_ring_lane_info_bf_copi_arr,  -- element RN connects to u_ring_lane_bf in gen_dut
+    miso_arr => reg_ring_lane_info_bf_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_monitor_v2_ring_rx_bf : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_monitor_v2_addr_w  -- p_mm steps 2**c_sdp_reg_bsn_monitor_v2_addr_w addresses per RN
+  )
+  port map (
+    mosi     => reg_bsn_monitor_v2_ring_rx_bf_copi,  -- read by p_mm for FPGA_bf_ring_rx_latency_R
+    miso     => reg_bsn_monitor_v2_ring_rx_bf_cipo,
+    mosi_arr => reg_bsn_monitor_v2_ring_rx_bf_copi_arr,  -- element RN connects to u_ring_lane_bf in gen_dut
+    miso_arr => reg_bsn_monitor_v2_ring_rx_bf_cipo_arr
+  );
 
+  u_mem_mux_reg_bsn_monitor_v2_ring_tx_bf : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_monitor_v2_addr_w  -- p_mm steps 2**c_sdp_reg_bsn_monitor_v2_addr_w addresses per RN
+  )
+  port map (
+    mosi     => reg_bsn_monitor_v2_ring_tx_bf_copi,  -- read by p_mm for FPGA_bf_ring_tx_latency_R
+    miso     => reg_bsn_monitor_v2_ring_tx_bf_cipo,
+    mosi_arr => reg_bsn_monitor_v2_ring_tx_bf_copi_arr,  -- element RN connects to u_ring_lane_bf in gen_dut
+    miso_arr => reg_bsn_monitor_v2_ring_tx_bf_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_monitor_v2_bf_rx_align : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_monitor_v2_addr_w + ceil_log2(c_sdp_P_sum)  -- extra address bits for the c_sdp_P_sum monitors per RN, see v_span_node in p_mm
+  )
+  port map (
+    mosi     => reg_bsn_monitor_v2_bf_rx_align_copi,  -- read by p_mm for FPGA_bf_rx_align_latency_R
+    miso     => reg_bsn_monitor_v2_bf_rx_align_cipo,
+    mosi_arr => reg_bsn_monitor_v2_bf_rx_align_copi_arr,  -- element RN connects to u_sdp_beamformer_remote in gen_dut
+    miso_arr => reg_bsn_monitor_v2_bf_rx_align_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_monitor_v2_bf_aligned : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_monitor_v2_addr_w  -- p_mm steps 2**c_sdp_reg_bsn_monitor_v2_addr_w addresses per RN
+  )
+  port map (
+    mosi     => reg_bsn_monitor_v2_bf_aligned_copi,  -- read by p_mm for FPGA_bf_aligned_latency_R
+    miso     => reg_bsn_monitor_v2_bf_aligned_cipo,
+    mosi_arr => reg_bsn_monitor_v2_bf_aligned_copi_arr,  -- element RN connects to u_sdp_beamformer_remote in gen_dut
+    miso_arr => reg_bsn_monitor_v2_bf_aligned_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_align_v2_bf : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_align_v2_addr_w
+  )
+  port map (
+    mosi     => reg_bsn_align_v2_bf_copi,  -- not accessed by p_mm
+    miso     => reg_bsn_align_v2_bf_cipo,
+    mosi_arr => reg_bsn_align_v2_bf_copi_arr,  -- element RN connects to u_sdp_beamformer_remote in gen_dut
+    miso_arr => reg_bsn_align_v2_bf_cipo_arr
+  );
 end tb;
diff --git a/applications/lofar2/libraries/sdp/tb/vhdl/tb_sdp_crosslets_remote_ring.vhd b/applications/lofar2/libraries/sdp/tb/vhdl/tb_sdp_crosslets_remote_ring.vhd
new file mode 100644
index 0000000000000000000000000000000000000000..1ec7de0db3d269388ee83389ea985b8da6ebf426
--- /dev/null
+++ b/applications/lofar2/libraries/sdp/tb/vhdl/tb_sdp_crosslets_remote_ring.vhd
@@ -0,0 +1,938 @@
+-------------------------------------------------------------------------------
+--
+-- Copyright 2024
+-- ASTRON (Netherlands Institute for Radio Astronomy) <http://www.astron.nl/>
+-- P.O.Box 2, 7990 AA Dwingeloo, The Netherlands
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+-------------------------------------------------------------------------------
+--
+-- Author: E. Kooistra
+-- Purpose:
+-- . Test bench for multiple sdp_crosslets_remote.vhd + ring_lane.vhd +
+--   tr_10GbE in a ring
+-- Description:
+-- . https://support.astron.nl/confluence/display/L2M/L5+SDPFW+Design+Document%3A+Subband+Correlator
+--
+-- . Block diagram:
+--   * tb can use one instance of tr_10Gbe to model Rx from ring and Tx to ring.
+--   * Ring lane serial links for ring nodes RN = 0 to c_last_rn:
+--
+--     tr_10gbe_ring_serial_tx_arr --> tr_10gbe_ring_serial_rx_arr after c_cable_delay
+--
+--         /<-------------------------------------------------------------\
+--         \---> 0 ---> RN - 1  --->  RN  --->  RN + 1 ---> c_last_rn --->/
+--                                    |^
+--    tr_10gbe_ring_serial_rx_arr(RN) || tr_10gbe_ring_serial_tx_arr(RN)
+--                                    v|
+--                                  tr_10Gbe
+--                                    |^
+--      tr_10gbe_ring_rx_sosi_arr(RN) || tr_10gbe_ring_tx_sosi_arr(RN)
+--                                    v|
+--                                 ring_lane
+--                                    |^
+--               from_ri_sosi_arr(RN) || to_ri_sosi_arr(RN)
+--                                    v|
+--        local_crosslets_sosi --> sdp_crosslets_remote --> x_sosi_arr(RN)(P_sq)
+--                                                          x_sosi
+--
+--   * BSN monitors:
+--                                            RN
+--                                            |^
+--         ring_lane/ring_rx                  || ring_lane/ring_tx
+--         FPGA_xst_ring_rx_latency_R(RN)(RN) || FPGA_xst_ring_tx_latency_R(RN)(RN)
+--                                            ||
+--      dp_bsn_align_v2 P_sq inputs           ||
+--      FPGA_xst_rx_align_latency_R(RN)(P_sq) ||
+--                                            ||
+--             dp_bsn_align_v2 aligned output ||
+--             FPGA_xst_aligned_latency_R(RN) v|
+--
+-- . XST ring latency results from SDP-ARTS HW:
+--   - xst_ring_rx_latency (SDP-ARTS HW):
+--     node 64:  -1   -1   -1   -1   -1   -1   -1   -1   1774 1569 1363 1112 906  677  472  266
+--     node 65:  249  -1   -1   -1   -1   -1   -1   -1   -1   1776 1579 1352 1113 890  692  472
+--     node 66:  466  267  -1   -1   -1   -1   -1   -1   -1   -1   1787 1566 1340 1105 905  685
+--     node 67:  688  487  266  -1   -1   -1   -1   -1   -1   -1   -1   1793 1566 1346 1128 905
+--     node 68:  904  699  493  264  -1   -1   -1   -1   -1   -1   -1   -1   1788 1567 1355 1133
+--     node 69:  1114 913  717  473  252  -1   -1   -1   -1   -1   -1   -1   -1   1776 1576 1357
+--     node 70:  1341 1122 945  681  460  259  -1   -1   -1   -1   -1   -1   -1   -1   1783 1566
+--     node 71:  1551 1348 1156 890  667  471  250  -1   -1   -1   -1   -1   -1   -1   -1   1773
+--     node 72:  1785 1596 1397 1122 894  711  482  277  -1   -1   -1   -1   -1   -1   -1   -1
+--     node 73:  -1   1819 1618 1350 1114 936  693  497  254  -1   -1   -1   -1   -1   -1   -1
+--     node 74:  -1   -1   1828 1563 1342 1146 901  704  461  260  -1   -1   -1   -1   -1   -1
+--     node 75:  -1   -1   -1   1784 1564 1366 1121 920  677  480  257  -1   -1   -1   -1   -1
+--     node 76:  -1   -1   -1   -1   1804 1597 1362 1164 913  707  500  273  -1   -1   -1   -1
+--     node 77:  -1   -1   -1   -1   -1   1810 1587 1390 1125 924  723  480  261  -1   -1   -1
+--     node 78:  -1   -1   -1   -1   -1   -1   1800 1599 1351 1137 938  693  472  253  -1   -1
+--     node 79:  -1   -1   -1   -1   -1   -1   -1   1809 1566 1344 1143 899  681  460  259  -1
+--
+--   # FPGA_xst_ring_rx_latency_R (sim: c_nof_delay = 0 with sdp_crosslets_remote_v2.vhd):
+--   #    0:     -1    -1    -1    -1    -1    -1    -1    -1  1604  1409  1211  1016   818   623   427   230
+--   #    1:    230    -1    -1    -1    -1    -1    -1    -1    -1  1604  1409  1211  1016   818   623   427
+--   #    2:    427   230    -1    -1    -1    -1    -1    -1    -1    -1  1604  1409  1211  1016   818   623
+--   #    3:    623   427   230    -1    -1    -1    -1    -1    -1    -1    -1  1604  1409  1211  1016   818
+--   #    4:    818   623   427   230    -1    -1    -1    -1    -1    -1    -1    -1  1604  1409  1211  1016
+--   #    5:   1016   818   623   427   230    -1    -1    -1    -1    -1    -1    -1    -1  1604  1409  1211
+--   #    6:   1211  1016   818   623   427   230    -1    -1    -1    -1    -1    -1    -1    -1  1604  1409
+--   #    7:   1409  1211  1016   818   623   427   230    -1    -1    -1    -1    -1    -1    -1    -1  1604
+--   #    8:   1604  1409  1211  1016   818   623   427   230    -1    -1    -1    -1    -1    -1    -1    -1
+--   #    9:     -1  1604  1409  1211  1016   818   623   427   230    -1    -1    -1    -1    -1    -1    -1
+--   #   10:     -1    -1  1604  1409  1211  1016   818   623   427   230    -1    -1    -1    -1    -1    -1
+--   #   11:     -1    -1    -1  1604  1409  1211  1016   818   623   427   230    -1    -1    -1    -1    -1
+--   #   12:     -1    -1    -1    -1  1604  1409  1211  1016   818   623   427   230    -1    -1    -1    -1
+--   #   13:     -1    -1    -1    -1    -1  1604  1409  1211  1016   818   623   427   230    -1    -1    -1
+--   #   14:     -1    -1    -1    -1    -1    -1  1604  1409  1211  1016   818   623   427   230    -1    -1
+--   #   15:     -1    -1    -1    -1    -1    -1    -1  1604  1409  1211  1016   818   623   427   230    -1
+--
+--   # FPGA_xst_ring_rx_latency_R (sim: c_nof_delay = 12):
+--   #    0:     -1    -1    -1    -1    -1    -1    -1    -1  1729  1533  1332  1053   856   638   442   245
+--   #    1:    245    -1    -1    -1    -1    -1    -1    -1    -1  1729  1533  1332  1053   856   638   442
+--   #    2:    442   245    -1    -1    -1    -1    -1    -1    -1    -1  1729  1533  1332  1053   856   638
+--   #    3:    638   442   245    -1    -1    -1    -1    -1    -1    -1    -1  1729  1533  1332  1053   856
+--   #    4:    856   638   442   245    -1    -1    -1    -1    -1    -1    -1    -1  1729  1533  1332  1053
+--   #    5:   1053   856   638   442   245    -1    -1    -1    -1    -1    -1    -1    -1  1729  1533  1332
+--   #    6:   1332  1053   856   638   442   245    -1    -1    -1    -1    -1    -1    -1    -1  1729  1533
+--   #    7:   1533  1332  1053   856   638   442   245    -1    -1    -1    -1    -1    -1    -1    -1  1729
+--   #    8:   1729  1533  1332  1053   856   638   442   245    -1    -1    -1    -1    -1    -1    -1    -1
+--   #    9:     -1  1729  1533  1332  1053   856   638   442   245    -1    -1    -1    -1    -1    -1    -1
+--   #   10:     -1    -1  1729  1533  1332  1053   856   638   442   245    -1    -1    -1    -1    -1    -1
+--   #   11:     -1    -1    -1  1729  1533  1332  1053   856   638   442   245    -1    -1    -1    -1    -1
+--   #   12:     -1    -1    -1    -1  1729  1533  1332  1053   856   638   442   245    -1    -1    -1    -1
+--   #   13:     -1    -1    -1    -1    -1  1729  1533  1332  1053   856   638   442   245    -1    -1    -1
+--   #   14:     -1    -1    -1    -1    -1    -1  1729  1533  1332  1053   856   638   442   245    -1    -1
+--   #   15:     -1    -1    -1    -1    -1    -1    -1  1729  1533  1332  1053   856   638   442   245    -1
+--
+--   # FPGA_xst_ring_rx_latency_R (sim: c_nof_delay = 25):
+--   #    0:     -1    -1    -1    -1    -1    -1    -1    -1  1789  1571  1352  1135   917   698   481   262
+--   #    1:    262    -1    -1    -1    -1    -1    -1    -1    -1  1789  1571  1352  1135   917   698   481
+--   #    2:    481   262    -1    -1    -1    -1    -1    -1    -1    -1  1789  1571  1352  1135   917   698
+--   #    3:    698   481   262    -1    -1    -1    -1    -1    -1    -1    -1  1789  1571  1352  1135   917
+--   #    4:    917   698   481   262    -1    -1    -1    -1    -1    -1    -1    -1  1789  1571  1352  1135
+--   #    5:   1135   917   698   481   262    -1    -1    -1    -1    -1    -1    -1    -1  1789  1571  1352
+--   #    6:   1352  1135   917   698   481   262    -1    -1    -1    -1    -1    -1    -1    -1  1789  1571
+--   #    7:   1571  1352  1135   917   698   481   262    -1    -1    -1    -1    -1    -1    -1    -1  1789
+--   #    8:   1789  1571  1352  1135   917   698   481   262    -1    -1    -1    -1    -1    -1    -1    -1
+--   #    9:     -1  1789  1571  1352  1135   917   698   481   262    -1    -1    -1    -1    -1    -1    -1
+--   #   10:     -1    -1  1789  1571  1352  1135   917   698   481   262    -1    -1    -1    -1    -1    -1
+--   #   11:     -1    -1    -1  1789  1571  1352  1135   917   698   481   262    -1    -1    -1    -1    -1
+--   #   12:     -1    -1    -1    -1  1789  1571  1352  1135   917   698   481   262    -1    -1    -1    -1
+--   #   13:     -1    -1    -1    -1    -1  1789  1571  1352  1135   917   698   481   262    -1    -1    -1
+--   #   14:     -1    -1    -1    -1    -1    -1  1789  1571  1352  1135   917   698   481   262    -1    -1
+--   #   15:     -1    -1    -1    -1    -1    -1    -1  1789  1571  1352  1135   917   698   481   262    -1
+--
+--   - xst_ring_tx_latency (SDP-ARTS HW):
+--     node 64:  12   -1   -1   -1   -1   -1   -1   -1   -1   1611 1361 1155 926  698  470  264
+--     node 65:  256  12   -1   -1   -1   -1   -1   -1   -1   -1   1583 1363 1143 920  676  476
+--     node 66:  470  272  12   -1   -1   -1   -1   -1   -1   -1   -1   1575 1353 1131 892  692
+--     node 67:  694  496  274  12   -1   -1   -1   -1   -1   -1   -1   -1   1577 1357 1119 914
+--     node 68:  922  714  486  258  12   -1   -1   -1   -1   -1   -1   -1   -1   1585 1347 1125
+--     node 69:  1145 926  704  482  260  12   -1   -1   -1   -1   -1   -1   -1   -1   1567 1345
+--     node 70:  1371 1147 924  704  484  264  12   -1   -1   -1   -1   -1   -1   -1   -1   1567
+--     node 71:  1579 1359 1139 916  696  476  256  12   -1   -1   -1   -1   -1   -1   -1   -1
+--     node 72:  -1   1597 1369 1141 934  706  500  274  12   -1   -1   -1   -1   -1   -1   -1
+--     node 73:  -1   -1   1593 1347 1149 930  706  484  262  12   -1   -1   -1   -1   -1   -1
+--     node 74:  -1   -1   -1   1571 1373 1151 928  708  488  264  12   -1   -1   -1   -1   -1
+--     node 75:  -1   -1   -1   -1   1595 1373 1151 928  708  488  264  12   -1   -1   -1   -1
+--     node 76:  -1   -1   -1   -1   -1   1619 1391 1163 934  728  500  294  12   -1   -1   -1
+--     node 77:  -1   -1   -1   -1   -1   -1   1611 1369 1145 946  706  508  264  12   -1   -1
+--     node 78:  -1   -1   -1   -1   -1   -1   -1   1593 1371 1171 928  732  488  268  12   -1
+--     node 79:  -1   -1   -1   -1   -1   -1   -1   -1   1587 1387 1143 948  702  480  262  12
+--
+--   # FPGA_xst_ring_tx_latency_R (sim: c_nof_delay = 0 with sdp_crosslets_remote_v2.vhd):
+--   #    0:     13    -1    -1    -1    -1    -1    -1    -1    -1  1415  1217  1023   824   629   434   237
+--   #    1:    237    13    -1    -1    -1    -1    -1    -1    -1    -1  1415  1217  1023   824   629   434
+--   #    2:    434   237    13    -1    -1    -1    -1    -1    -1    -1    -1  1415  1217  1023   824   629
+--   #    3:    629   434   237    13    -1    -1    -1    -1    -1    -1    -1    -1  1415  1217  1023   824
+--   #    4:    824   629   434   237    13    -1    -1    -1    -1    -1    -1    -1    -1  1415  1217  1023
+--   #    5:   1023   824   629   434   237    13    -1    -1    -1    -1    -1    -1    -1    -1  1415  1217
+--   #    6:   1217  1023   824   629   434   237    13    -1    -1    -1    -1    -1    -1    -1    -1  1415
+--   #    7:   1415  1217  1023   824   629   434   237    13    -1    -1    -1    -1    -1    -1    -1    -1
+--   #    8:     -1  1415  1217  1023   824   629   434   237    13    -1    -1    -1    -1    -1    -1    -1
+--   #    9:     -1    -1  1415  1217  1023   824   629   434   237    13    -1    -1    -1    -1    -1    -1
+--   #   10:     -1    -1    -1  1415  1217  1023   824   629   434   237    13    -1    -1    -1    -1    -1
+--   #   11:     -1    -1    -1    -1  1415  1217  1023   824   629   434   237    13    -1    -1    -1    -1
+--   #   12:     -1    -1    -1    -1    -1  1415  1217  1023   824   629   434   237    13    -1    -1    -1
+--   #   13:     -1    -1    -1    -1    -1    -1  1415  1217  1023   824   629   434   237    13    -1    -1
+--   #   14:     -1    -1    -1    -1    -1    -1    -1  1415  1217  1023   824   629   434   237    13    -1
+--   #   15:     -1    -1    -1    -1    -1    -1    -1    -1  1415  1217  1023   824   629   434   237    13
+--
+--   # FPGA_xst_ring_tx_latency_R (sim: c_nof_delay = 12):
+--   #    0:     12    -1    -1    -1    -1    -1    -1    -1    -1  1539  1339  1119   862   645   448   251
+--   #    1:    251    12    -1    -1    -1    -1    -1    -1    -1    -1  1539  1339  1119   862   645   448
+--   #    2:    448   251    12    -1    -1    -1    -1    -1    -1    -1    -1  1539  1339  1119   862   645
+--   #    3:    645   448   251    12    -1    -1    -1    -1    -1    -1    -1    -1  1539  1339  1119   862
+--   #    4:    862   645   448   251    12    -1    -1    -1    -1    -1    -1    -1    -1  1539  1339  1119
+--   #    5:   1119   862   645   448   251    12    -1    -1    -1    -1    -1    -1    -1    -1  1539  1339
+--   #    6:   1339  1119   862   645   448   251    12    -1    -1    -1    -1    -1    -1    -1    -1  1539
+--   #    7:   1539  1339  1119   862   645   448   251    12    -1    -1    -1    -1    -1    -1    -1    -1
+--   #    8:     -1  1539  1339  1119   862   645   448   251    12    -1    -1    -1    -1    -1    -1    -1
+--   #    9:     -1    -1  1539  1339  1119   862   645   448   251    12    -1    -1    -1    -1    -1    -1
+--   #   10:     -1    -1    -1  1539  1339  1119   862   645   448   251    12    -1    -1    -1    -1    -1
+--   #   11:     -1    -1    -1    -1  1539  1339  1119   862   645   448   251    12    -1    -1    -1    -1
+--   #   12:     -1    -1    -1    -1    -1  1539  1339  1119   862   645   448   251    12    -1    -1    -1
+--   #   13:     -1    -1    -1    -1    -1    -1  1539  1339  1119   862   645   448   251    12    -1    -1
+--   #   14:     -1    -1    -1    -1    -1    -1    -1  1539  1339  1119   862   645   448   251    12    -1
+--   #   15:     -1    -1    -1    -1    -1    -1    -1    -1  1539  1339  1119   862   645   448   251    12
+--
+--   # FPGA_xst_ring_tx_latency_R (sim: c_nof_delay = 25):
+--   #    0:     12    -1    -1    -1    -1    -1    -1    -1    -1  1577  1359  1141   924   705   488   269
+--   #    1:    269    12    -1    -1    -1    -1    -1    -1    -1    -1  1577  1359  1141   924   705   488
+--   #    2:    488   269    12    -1    -1    -1    -1    -1    -1    -1    -1  1577  1359  1141   924   705
+--   #    3:    705   488   269    12    -1    -1    -1    -1    -1    -1    -1    -1  1577  1359  1141   924
+--   #    4:    924   705   488   269    12    -1    -1    -1    -1    -1    -1    -1    -1  1577  1359  1141
+--   #    5:   1141   924   705   488   269    12    -1    -1    -1    -1    -1    -1    -1    -1  1577  1359
+--   #    6:   1359  1141   924   705   488   269    12    -1    -1    -1    -1    -1    -1    -1    -1  1577
+--   #    7:   1577  1359  1141   924   705   488   269    12    -1    -1    -1    -1    -1    -1    -1    -1
+--   #    8:     -1  1577  1359  1141   924   705   488   269    12    -1    -1    -1    -1    -1    -1    -1
+--   #    9:     -1    -1  1577  1359  1141   924   705   488   269    12    -1    -1    -1    -1    -1    -1
+--   #   10:     -1    -1    -1  1577  1359  1141   924   705   488   269    12    -1    -1    -1    -1    -1
+--   #   11:     -1    -1    -1    -1  1577  1359  1141   924   705   488   269    12    -1    -1    -1    -1
+--   #   12:     -1    -1    -1    -1    -1  1577  1359  1141   924   705   488   269    12    -1    -1    -1
+--   #   13:     -1    -1    -1    -1    -1    -1  1577  1359  1141   924   705   488   269    12    -1    -1
+--   #   14:     -1    -1    -1    -1    -1    -1    -1  1577  1359  1141   924   705   488   269    12    -1
+--   #   15:     -1    -1    -1    -1    -1    -1    -1    -1  1577  1359  1141   924   705   488   269    12
+--
+--   - xst_rx_align_latency (SDP-ARTS HW):
+--     node 64:  1    204  434  638  868  1109 1318 1546 1774
+--     node 65:  1    214  412  652  852  1109 1315 1532 1756
+--     node 66:  1    202  422  622  866  1109 1326 1529 1750
+--     node 67:  1    204  426  648  846  1109 1324 1548 1751
+--     node 68:  1    210  416  644  874  1109 1322 1528 1758
+--     node 69:  1    204  426  626  848  1109 1328 1546 1746
+--     node 70:  1    208  428  648  848  1109 1330 1550 1753
+--     node 71:  1    210  430  648  870  1109 1328 1552 1770
+--     node 72:  1    230  436  666  892  1109 1342 1570 1776
+--     node 73:  1    202  444  640  884  1109 1327 1566 1788
+--     node 74:  1    222  422  664  860  1109 1323 1543 1784
+--     node 75:  1    214  432  634  878  1109 1319 1541 1763
+--     node 76:  1    232  438  668  872  1125 1346 1559 1789
+--     node 77:  1    216  436  654  876  1109 1337 1554 1777
+--     node 78:  1    206  430  648  868  1109 1332 1559 1772
+--     node 79:  1    208  430  650  870  1109 1328 1550 1775
+--
+--   # FPGA_xst_rx_align_latency_R (sim: c_nof_delay = 0 with sdp_crosslets_remote_v2.vhd):
+--   #    0:      1   235   432   628   823  1021  1216  1414  1609
+--   #    1:      1   235   432   628   823  1021  1216  1414  1609
+--   #    2:      1   235   432   628   823  1021  1216  1414  1609
+--   #    3:      1   235   432   628   823  1021  1216  1414  1609
+--   #    4:      1   235   432   628   823  1021  1216  1414  1609
+--   #    5:      1   235   432   628   823  1021  1216  1414  1609
+--   #    6:      1   235   432   628   823  1021  1216  1414  1609
+--   #    7:      1   235   432   628   823  1021  1216  1414  1609
+--   #    8:      1   235   432   628   823  1021  1216  1414  1609
+--   #    9:      1   235   432   628   823  1021  1216  1414  1609
+--   #   10:      1   235   432   628   823  1021  1216  1414  1609
+--   #   11:      1   235   432   628   823  1021  1216  1414  1609
+--   #   12:      1   235   432   628   823  1021  1216  1414  1609
+--   #   13:      1   235   432   628   823  1021  1216  1414  1609
+--   #   14:      1   235   432   628   823  1021  1216  1414  1609
+--   #   15:      1   235   432   628   823  1021  1216  1414  1609
+--
+--   # FPGA_xst_rx_align_latency_R (sim: c_nof_delay = 12):
+--   #    0:      1   199   396   593   810  1109  1308  1506  1702
+--   #    1:      1   199   396   593   810  1109  1308  1506  1702
+--   #    2:      1   199   396   593   810  1109  1308  1506  1702
+--   #    3:      1   199   396   593   810  1109  1308  1506  1702
+--   #    4:      1   199   396   593   810  1109  1308  1506  1702
+--   #    5:      1   199   396   593   810  1109  1308  1506  1702
+--   #    6:      1   199   396   593   810  1109  1308  1506  1702
+--   #    7:      1   199   396   593   810  1109  1308  1506  1702
+--   #    8:      1   199   396   593   810  1109  1308  1506  1702
+--   #    9:      1   199   396   593   810  1109  1308  1506  1702
+--   #   10:      1   199   396   593   810  1109  1308  1506  1702
+--   #   11:      1   199   396   593   810  1109  1308  1506  1702
+--   #   12:      1   199   396   593   810  1109  1308  1506  1702
+--   #   13:      1   199   396   593   810  1109  1308  1506  1702
+--   #   14:      1   199   396   593   810  1109  1308  1506  1702
+--   #   15:      1   199   396   593   810  1109  1308  1506  1702
+--
+--   # FPGA_xst_rx_align_latency_R (sim: c_nof_delay = 25):
+--   #    0:      1   217   436   653   872  1109  1326  1544  1762
+--   #    1:      1   217   436   653   872  1109  1326  1544  1762
+--   #    2:      1   217   436   653   872  1109  1326  1544  1762
+--   #    3:      1   217   436   653   872  1109  1326  1544  1762
+--   #    4:      1   217   436   653   872  1109  1326  1544  1762
+--   #    5:      1   217   436   653   872  1109  1326  1544  1762
+--   #    6:      1   217   436   653   872  1109  1326  1544  1762
+--   #    7:      1   217   436   653   872  1109  1326  1544  1762
+--   #    8:      1   217   436   653   872  1109  1326  1544  1762
+--   #    9:      1   217   436   653   872  1109  1326  1544  1762
+--   #   10:      1   217   436   653   872  1109  1326  1544  1762
+--   #   11:      1   217   436   653   872  1109  1326  1544  1762
+--   #   12:      1   217   436   653   872  1109  1326  1544  1762
+--   #   13:      1   217   436   653   872  1109  1326  1544  1762
+--   #   14:      1   217   436   653   872  1109  1326  1544  1762
+--   #   15:      1   217   436   653   872  1109  1326  1544  1762
+--
+--   - xst_aligned_latency (SDP-ARTS HW):    # FPGA_xst_aligned_latency_R (sim: c_nof_delay = 0, 12, 25):
+--     node 64:  2051                        # 0: 2051
+--     node 65:  2051                        # 1: 2051
+--     node 66:  2051                        # 2: 2051
+--     node 67:  2051                        # 3: 2051
+--     node 68:  2051                        # 4: 2051
+--     node 69:  2051                        # 5: 2051
+--     node 70:  2051                        # 6: 2051
+--     node 71:  2051                        # 7: 2051
+--     node 72:  2051                        # 8: 2051
+--     node 73:  2051                        # 9: 2051
+--     node 74:  2051                        # 10: 2051
+--     node 75:  2051                        # 11: 2051
+--     node 76:  2051                        # 12: 2051
+--     node 77:  2051                        # 13: 2051
+--     node 78:  2051                        # 14: 2051
+--     node 79:  2051                        # 15: 2051
+--
+-- Usage:
+-- > as 3 or more
+-- > add wave -position insertpoint sim:/tb_sdp_crosslets_remote_ring/x_sosi_2arr
+-- > run -a
+-------------------------------------------------------------------------------
+
+library IEEE, common_lib, dp_lib, st_lib, ring_lib, tr_10GbE_lib, tech_pll_lib;
+use IEEE.std_logic_1164.all;
+use common_lib.common_pkg.all;
+use common_lib.common_mem_pkg.all;
+use common_lib.tb_common_pkg.all;
+use common_lib.tb_common_mem_pkg.all;
+use common_lib.common_str_pkg.all;
+use dp_lib.dp_stream_pkg.all;
+use ring_lib.ring_pkg.all;
+use tech_pll_lib.tech_pll_component_pkg.all;
+use work.sdp_pkg.all;
+use work.tb_sdp_pkg.all;
+
+entity tb_sdp_crosslets_remote_ring is
+  generic (
+    g_nof_rn    : natural := 4;  -- number of nodes in the ring
+    g_nof_sync  : natural := 2  -- number of sync intervals in the stimuli, each of c_nof_blocks_per_sync blocks
+  );
+end tb_sdp_crosslets_remote_ring;
+
+architecture tb of tb_sdp_crosslets_remote_ring is
+  constant c_dp_clk_period : time := 5 ns;  -- 200 MHz
+  constant c_mm_clk_period : time := 1 ns;  -- fast MM clk to speed up simulation
+  constant c_sa_clk_period : time := tech_pll_clk_644_period;  -- 644MHz
+
+  -- Apply cable delay in tech_pll_clk_156_period units, to remain aligned with tr_10GbE sim model
+  -- . Choose c_cable_delay = 16 * 6.4 ~= 102 ns ~= 20 dp_clk of 5 ns, to match delay seen on HW
+  -- . Minimum c_cable_delay >= 12 * 6.4 = 77 ns ~= 15 dp_clk of 5 ns, else missed blocks in x_sosi
+  --   This minimum occurs when g_nof_rn > 8 and was found with g_nof_rn = 16. It happens because
+  --   the local crosslets are passed through ring_mux and dp_demux. This causes the block period
+  --   of the local crosslets to vary, so that there is not enough time to read all aligned
+  --   crosslets. Therefore use sdp_crosslets_remote_v2.vhd instead.
+  -- . Maximum c_cable_delay <= 29 * 6.4 = 185 ns ~= 37 dp_clk of 5 ns, else missed blocks in x_sosi
+  constant c_clk_156_period  : time := tech_pll_clk_156_period;  -- 6.400020 ns ~= 156.25 MHz
+  constant c_nof_delay       : natural := 0;
+  constant c_cable_delay     : time := c_clk_156_period * c_nof_delay;
+
+  -- XST data
+  constant c_P_sq                      : natural := g_nof_rn / 2 + 1;  -- nof square correlator cells
+  constant c_nof_transport_hops        : natural := c_P_sq - 1;
+  constant c_block_period              : natural := c_sdp_N_fft;
+  constant c_block_size                : natural := c_sdp_N_crosslets_max * c_sdp_S_pn;
+  constant c_gap_size                  : natural := c_block_period - c_block_size;
+  constant c_nof_blocks_per_sync       : natural := 10;
+  constant c_local_crosslet_re         : integer := 1;
+  constant c_local_crosslet_im         : integer := 2;
+
+  constant c_last_rn                   : natural := g_nof_rn - 1;  -- first ring node has index RN = 0 by definition.
+
+  type t_ring_integer_2arr is array (integer range <>) of t_integer_arr(c_last_rn downto 0);
+
+  type t_crosslets_cipo_2arr is array (integer range <>) of t_mem_cipo_arr(c_P_sq - 1 downto 0);
+  type t_crosslets_sosi_2arr is array (integer range <>) of t_dp_sosi_arr(c_P_sq - 1 downto 0);
+  type t_crosslets_integer_2arr is array (integer range <>) of t_integer_arr(c_P_sq - 1 downto 0);
+
+  -- Ring lane packets
+  constant c_use_cable                 : std_logic := '1';  -- '0' ring via PCB traces, '1' ring via QSFP cables
+
+  -- = crosslet subband select block size divided by 2 as it is repacked from 32b to 64b. = 42 longwords
+  constant c_lane_payload_nof_longwords_xst : natural := c_sdp_N_crosslets_max * c_sdp_S_pn / 2;
+  constant c_lane_packet_nof_longwords_max  : natural := c_lane_payload_nof_longwords_xst + c_ring_dp_hdr_field_size;
+                                                         -- = 42 + 3 = 45
+  constant c_fifo_tx_fill_margin       : natural := 10;  -- >= c_fifo_fill_margin = 6 that is used in dp_fifo_fill_eop
+  constant c_fifo_tx_size_ring : natural := true_log_pow2(c_lane_packet_nof_longwords_max * 2 + c_fifo_tx_fill_margin);
+                                            -- = 45 * 2 + 10 = 100 --> 128
+  constant c_fifo_tx_fill_ring : natural := c_fifo_tx_size_ring - c_fifo_tx_fill_margin;
+                                            -- = maximum fill level, so rely on eop
+  constant c_err_bi                    : natural := 0;
+  constant c_nof_err_counts            : natural := 8;
+  constant c_bsn_at_sync_check_channel : natural := 1;
+  constant c_validate_channel          : boolean := true;
+  constant c_validate_channel_mode     : string  := "=";
+  constant c_sync_timeout              : natural := c_block_period * (c_nof_blocks_per_sync + 1);
+
+  -- Timeout tb if there is no output x_sosi
+  constant c_tb_timeout                : time := (g_nof_sync + 1) * c_sync_timeout * c_dp_clk_period;
+
+  -- Address widths of a single MM instance
+  constant c_addr_w_reg_ring_lane_info_xst          : natural := 1;
+
+  signal mm_init                : std_logic := '1';
+  signal tb_end                 : std_logic := '0';
+  signal dp_clk                 : std_logic := '1';
+  signal dp_rst                 : std_logic;
+  signal mm_clk                 : std_logic := '1';
+  signal mm_rst                 : std_logic;
+  signal SA_CLK                 : std_logic := '1';
+  signal tr_ref_clk_312         : std_logic := '0';
+  signal tr_ref_clk_156         : std_logic := '0';
+  signal tr_ref_rst_156         : std_logic := '0';
+
+  signal stimuli_rst              : std_logic;
+  signal stimuli_end              : std_logic;
+
+  signal stimuli_sosi             : t_dp_sosi;
+  signal local_crosslets_sosi_arr : t_dp_sosi_arr(c_last_rn downto 0);
+  signal xst_bs_sosi              : t_dp_sosi;
+  signal from_ri_sosi_arr         : t_dp_sosi_arr(c_last_rn downto 0);
+  signal to_ri_sosi_arr           : t_dp_sosi_arr(c_last_rn downto 0);
+  signal crosslets_sosi_arr       : t_dp_sosi_arr(c_last_rn downto 0);
+  signal crosslets_copi_arr       : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal crosslets_cipo_2arr      : t_crosslets_cipo_2arr(c_last_rn downto 0);
+  signal x_sosi_2arr              : t_crosslets_sosi_2arr(c_last_rn downto 0);
+  signal x_sosi_2arr_valids       : std_logic_vector(g_nof_rn * c_P_sq - 1 downto 0);
+  signal x_sosi_arr               : t_dp_sosi_arr(c_last_rn downto 0);
+  signal x_sosi                   : t_dp_sosi;
+
+  -- 10GbE ring
+  signal tr_10gbe_ring_rx_sosi_arr    : t_dp_sosi_arr(c_last_rn downto 0) := (others => c_dp_sosi_rst);
+  signal tr_10gbe_ring_tx_sosi_arr    : t_dp_sosi_arr(c_last_rn downto 0) := (others => c_dp_sosi_rst);
+  signal tr_10gbe_ring_serial_rx_arr  : std_logic_vector(c_last_rn downto 0) := (others => '0');
+  signal tr_10gbe_ring_serial_tx_arr  : std_logic_vector(c_last_rn downto 0) := (others => '0');
+
+  -- Crosslets ring MM registers
+  signal reg_ring_lane_info_xst_copi_arr         : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_ring_lane_info_xst_cipo_arr         : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_ring_lane_info_xst_copi             : t_mem_copi := c_mem_copi_rst;
+  signal reg_ring_lane_info_xst_cipo             : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_bsn_monitor_v2_ring_rx_xst_copi_arr : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_monitor_v2_ring_rx_xst_cipo_arr : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_monitor_v2_ring_rx_xst_copi     : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_monitor_v2_ring_rx_xst_cipo     : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_bsn_monitor_v2_ring_tx_xst_copi_arr : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_monitor_v2_ring_tx_xst_cipo_arr : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_monitor_v2_ring_tx_xst_copi     : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_monitor_v2_ring_tx_xst_cipo     : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_dp_block_validate_err_xst_copi_arr  : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_dp_block_validate_err_xst_cipo_arr  : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_dp_block_validate_err_xst_copi      : t_mem_copi := c_mem_copi_rst;
+  signal reg_dp_block_validate_err_xst_cipo      : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_dp_block_validate_bsn_at_sync_xst_copi_arr : t_mem_copi_arr(c_last_rn downto 0) :=
+                                                         (others => c_mem_copi_rst);
+  signal reg_dp_block_validate_bsn_at_sync_xst_cipo_arr : t_mem_cipo_arr(c_last_rn downto 0) :=
+                                                         (others => c_mem_cipo_rst);
+  signal reg_dp_block_validate_bsn_at_sync_xst_copi     : t_mem_copi := c_mem_copi_rst;
+  signal reg_dp_block_validate_bsn_at_sync_xst_cipo     : t_mem_cipo := c_mem_cipo_rst;
+  -- Crosslets ring MM points
+  signal FPGA_xst_ring_nof_transport_hops_R       : t_natural_arr(c_last_rn downto 0);
+  signal FPGA_xst_ring_rx_latency_R               : t_ring_integer_2arr(c_last_rn downto 0);
+  signal FPGA_xst_ring_tx_latency_R               : t_ring_integer_2arr(c_last_rn downto 0);
+
+  -- BSN aligner MM registers
+  signal reg_bsn_align_v2_xst_copi_arr            : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_align_v2_xst_cipo_arr            : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_align_v2_xst_copi                : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_align_v2_xst_cipo                : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_bsn_monitor_v2_xst_rx_align_copi_arr : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_monitor_v2_xst_rx_align_cipo_arr : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_monitor_v2_xst_rx_align_copi     : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_monitor_v2_xst_rx_align_cipo     : t_mem_cipo := c_mem_cipo_rst;
+  signal reg_bsn_monitor_v2_xst_aligned_copi_arr  : t_mem_copi_arr(c_last_rn downto 0) := (others => c_mem_copi_rst);
+  signal reg_bsn_monitor_v2_xst_aligned_cipo_arr  : t_mem_cipo_arr(c_last_rn downto 0) := (others => c_mem_cipo_rst);
+  signal reg_bsn_monitor_v2_xst_aligned_copi      : t_mem_copi := c_mem_copi_rst;
+  signal reg_bsn_monitor_v2_xst_aligned_cipo      : t_mem_cipo := c_mem_cipo_rst;
+  -- BSN aligner Monitor Points
+  signal FPGA_xst_rx_align_latency_R              : t_crosslets_integer_2arr(c_last_rn downto 0);
+  signal FPGA_xst_aligned_latency_R               : t_integer_arr(c_last_rn downto 0);
+begin
+  dp_rst <= '1', '0' after c_dp_clk_period * 7;  -- release dp_rst after 7 dp_clk periods
+  dp_clk <= (not dp_clk) or tb_end after c_dp_clk_period / 2;  -- stops toggling (stays '1') when tb_end = '1'
+
+  mm_rst <= '1', '0' after c_mm_clk_period * 7;  -- release mm_rst after 7 mm_clk periods
+  mm_clk <= (not mm_clk) or tb_end after c_mm_clk_period / 2;  -- stops toggling (stays '1') when tb_end = '1'
+
+  -- Hold the stimuli in reset for 15 us, to wait for tr_10GbE to be active
+  stimuli_rst <= '1', '0' after 15 us;
+
+  SA_CLK <= not SA_CLK after c_sa_clk_period / 2;  -- Serial Gigabit IO sa clock (644 MHz), free running
+
+  -- Generate one local crosslets stream, that is used as basis for the local crosslets of all nodes
+  u_stimuli : entity dp_lib.dp_stream_stimuli
+  generic map (
+    g_sync_period => c_nof_blocks_per_sync,
+    g_nof_repeat  => c_nof_blocks_per_sync * g_nof_sync,  -- total nof blocks for g_nof_sync sync intervals
+    g_pkt_len     => c_block_size,
+    g_pkt_gap     => c_gap_size  -- c_block_size + c_gap_size = c_block_period
+  )
+  port map (
+    rst               => stimuli_rst,  -- released after 15 us
+    clk               => dp_clk,
+    -- Generate stimuli
+    src_out           => stimuli_sosi,
+    -- End of stimuli, p_mm waits for this before it prints the results
+    tb_end            => stimuli_end
+  );
+
+  p_local_crosslets_sosi : process(stimuli_sosi)  -- constant crosslet data eases verification at each node
+    variable v_sosi : t_dp_sosi;
+  begin
+    for I in local_crosslets_sosi_arr'range loop
+      v_sosi := stimuli_sosi;  -- start from the stimuli, then overrule the payload fields
+      v_sosi.data := TO_DP_SDATA(0);
+      v_sosi.re := TO_DP_DSP_DATA(I * c_nof_complex + c_local_crosslet_re);  -- odd, /= 0 and different per node
+      v_sosi.im := TO_DP_DSP_DATA(I * c_nof_complex + c_local_crosslet_im);  -- even, /= 0 and different per node
+      v_sosi.channel := TO_DP_CHANNEL(0);
+      v_sosi.err := TO_DP_ERROR(0);
+      local_crosslets_sosi_arr(I) <= v_sosi;
+    end loop;
+  end process;
+
+  xst_bs_sosi <= local_crosslets_sosi_arr(0);
+
+  p_mm : process  -- MM master: set nof hops per node, read the latency monitors, print them and end the tb
+    -- MM access
+    variable v_span          : natural;  -- MM address span of one register instance
+    variable v_span_node     : natural;  -- MM address span of all register instances of one ring node
+    variable v_offset        : natural;  -- MM word address of the accessed register field
+    -- print_str()
+    constant c_nof_col       : natural := 1 + g_nof_rn;  -- one column for the node index, one per latency value
+    constant c_col_w         : natural := 6;
+    constant c_line_w        : natural := c_nof_col * c_col_w;
+    variable v_line          : string(1 to c_line_w);
+    variable v_col           : natural;
+  begin
+    proc_common_wait_until_low(mm_clk, mm_rst);  -- mm_rst is timed in c_mm_clk_period units, so sample it with mm_clk
+    proc_common_wait_some_cycles(mm_clk, 10);
+
+    proc_common_wait_cross_clock_domain_latency(c_mm_clk_period, c_dp_clk_period,
+                                                c_common_cross_clock_domain_latency * 2);
+    ---------------------------------------------------------------------------
+    -- Setup transport nof hops for each RN to c_nof_transport_hops
+    ---------------------------------------------------------------------------
+    -- Write FPGA_xst_ring_nof_transport_hops_RW = ring_lane_info.transport_nof_hops
+    v_span := 2**c_addr_w_reg_ring_lane_info_xst;
+    for RN in 0 to c_last_rn loop
+      v_offset := 1 + RN * v_span;  -- word 1 of the ring_lane_info register of node RN
+      proc_mem_mm_bus_wr(v_offset, c_nof_transport_hops, mm_clk,
+                         reg_ring_lane_info_xst_cipo, reg_ring_lane_info_xst_copi);
+    end loop;
+    proc_common_wait_cross_clock_domain_latency(c_mm_clk_period, c_dp_clk_period,
+                                                c_common_cross_clock_domain_latency * 2);
+    -- Readback FPGA_xst_ring_nof_transport_hops_R
+    for RN in 0 to c_last_rn loop
+      v_offset := 1 + RN * v_span;
+      proc_mem_mm_bus_rd(v_offset, mm_clk, reg_ring_lane_info_xst_cipo, reg_ring_lane_info_xst_copi);
+      proc_mem_mm_bus_rd_latency(1, mm_clk);
+      FPGA_xst_ring_nof_transport_hops_R(RN) <= TO_UINT(reg_ring_lane_info_xst_cipo.rddata(c_word_w - 1 downto 0));
+    end loop;
+
+    ---------------------------------------------------------------------------
+    -- Wait until second x_sosi.sync
+    ---------------------------------------------------------------------------
+    proc_common_wait_until_hi_lo(dp_clk, x_sosi.sync);
+    proc_common_wait_until_hi_lo(dp_clk, x_sosi.sync);
+
+    ---------------------------------------------------------------------------
+    -- Read BSN monitors
+    ---------------------------------------------------------------------------
+    v_span := 2**c_sdp_reg_bsn_monitor_v2_addr_w;  -- word offset 6 of each BSN monitor is read as the latency
+    -- Read FPGA_xst_ring_rx_latency_R, for g_nof_rn monitors per RN
+    v_span_node := true_log_pow2(g_nof_rn) * v_span;
+    for RN in 0 to c_last_rn loop
+      for U in 0 to c_last_rn loop
+        v_offset := 6 + RN * v_span_node + U * v_span;
+        proc_mem_mm_bus_rd(v_offset, mm_clk, reg_bsn_monitor_v2_ring_rx_xst_cipo, reg_bsn_monitor_v2_ring_rx_xst_copi);
+        proc_mem_mm_bus_rd_latency(1, mm_clk);
+        FPGA_xst_ring_rx_latency_R(RN)(U) <= TO_SINT(reg_bsn_monitor_v2_ring_rx_xst_cipo.rddata(c_word_w - 1 downto 0));
+      end loop;
+    end loop;
+    -- Read FPGA_xst_rx_align_latency_R, for c_P_sq inputs per RN
+    v_span_node := true_log_pow2(c_P_sq) * v_span;
+    for RN in 0 to c_last_rn loop
+      for P in 0 to c_P_sq - 1 loop
+        v_offset := 6 + RN * v_span_node + P * v_span;
+        proc_mem_mm_bus_rd(v_offset, mm_clk, reg_bsn_monitor_v2_xst_rx_align_cipo, reg_bsn_monitor_v2_xst_rx_align_copi);
+        proc_mem_mm_bus_rd_latency(1, mm_clk);
+        FPGA_xst_rx_align_latency_R(RN)(P) <= TO_SINT(reg_bsn_monitor_v2_xst_rx_align_cipo.rddata(c_word_w - 1 downto 0));
+      end loop;
+    end loop;
+    -- Read FPGA_xst_aligned_latency_R, one monitor per RN
+    for RN in 0 to c_last_rn loop
+      v_offset := 6 + RN * v_span;
+      proc_mem_mm_bus_rd(v_offset, mm_clk, reg_bsn_monitor_v2_xst_aligned_cipo, reg_bsn_monitor_v2_xst_aligned_copi);
+      proc_mem_mm_bus_rd_latency(1, mm_clk);
+      FPGA_xst_aligned_latency_R(RN) <= TO_SINT(reg_bsn_monitor_v2_xst_aligned_cipo.rddata(c_word_w - 1 downto 0));
+    end loop;
+    -- Read FPGA_xst_ring_tx_latency_R, for g_nof_rn monitors per RN
+    v_span_node := true_log_pow2(g_nof_rn) * v_span;
+    for RN in 0 to c_last_rn loop
+      for U in 0 to c_last_rn loop
+        v_offset := 6 + RN * v_span_node + U * v_span;
+        proc_mem_mm_bus_rd(v_offset, mm_clk, reg_bsn_monitor_v2_ring_tx_xst_cipo, reg_bsn_monitor_v2_ring_tx_xst_copi);
+        proc_mem_mm_bus_rd_latency(1, mm_clk);
+        FPGA_xst_ring_tx_latency_R(RN)(U) <= TO_SINT(reg_bsn_monitor_v2_ring_tx_xst_cipo.rddata(c_word_w - 1 downto 0));
+      end loop;
+    end loop;
+
+    ---------------------------------------------------------------------------
+    -- Wait until end of simulation
+    ---------------------------------------------------------------------------
+    mm_init <= '0';  -- MM setup and monitor read out are done
+
+    proc_common_wait_until_high(dp_clk, stimuli_end);
+    proc_common_wait_some_cycles(dp_clk, 1000);
+
+    ---------------------------------------------------------------------------
+    -- Print latency results
+    ---------------------------------------------------------------------------
+    print_str("c_cable_delay = " & int_to_str(c_nof_delay) & " * 6.4 ns");
+    print_str("");
+    print_str("FPGA_xst_ring_rx_latency_R:");
+    for RN in 0 to c_last_rn loop
+      v_line := (others => ' ');
+      -- ring node index in column 0
+      v_line(1 to c_col_w - 2) := int_to_str(RN, c_col_w - 2);
+      v_line(c_col_w - 1) := ':';
+      -- latency values in columns 1 to g_nof_rn
+      for U in 0 to c_last_rn loop
+        v_col := 1 + U;
+        v_line(1 + v_col * c_col_w to (v_col + 1) * c_col_w) :=
+               int_to_str(FPGA_xst_ring_rx_latency_R(RN)(U), c_col_w);
+      end loop;
+      print_str(v_line);
+    end loop;
+    print_str("");
+
+    print_str("FPGA_xst_ring_tx_latency_R:");
+    for RN in 0 to c_last_rn loop
+      v_line := (others => ' ');
+      -- ring node index in column 0
+      v_line(1 to c_col_w - 2) := int_to_str(RN, c_col_w - 2);
+      v_line(c_col_w - 1) := ':';
+      -- latency values in columns 1 to g_nof_rn
+      for U in 0 to c_last_rn loop
+        v_col := 1 + U;
+        v_line(1 + v_col * c_col_w to (v_col + 1) * c_col_w) :=
+               int_to_str(FPGA_xst_ring_tx_latency_R(RN)(U), c_col_w);
+      end loop;
+      print_str(v_line);
+    end loop;
+    print_str("");
+
+    print_str("FPGA_xst_rx_align_latency_R:");
+    for RN in 0 to c_last_rn loop
+      v_line := (others => ' ');
+      -- ring node index in column 0
+      v_line(1 to c_col_w - 2) := int_to_str(RN, c_col_w - 2);
+      v_line(c_col_w - 1) := ':';
+      -- latency values in columns 1 to c_P_sq, the remaining columns stay blank
+      for P in 0 to c_P_sq - 1 loop
+        v_col := 1 + P;
+        v_line(1 + v_col * c_col_w to (v_col + 1) * c_col_w) :=
+               int_to_str(FPGA_xst_rx_align_latency_R(RN)(P), c_col_w);
+      end loop;
+      print_str(v_line);
+    end loop;
+    print_str("");
+
+    print_str("FPGA_xst_aligned_latency_R:");
+    for RN in 0 to c_last_rn loop
+      print_str(int_to_str(RN) & ": " & int_to_str(FPGA_xst_aligned_latency_R(RN)));
+    end loop;
+    print_str("");
+
+    tb_end <= '1';  -- stops dp_clk and mm_clk, and triggers proc_common_stop_simulation
+    wait;
+  end process;
+
+  -- End the tb simulation
+  proc_common_timeout_failure(c_tb_timeout, tb_end);  -- ERROR: end simulation if it fails to end in time
+  proc_common_stop_simulation(tb_end);  -- OK: end simulation
+
+  ------------------------------------------------------------------------------
+  -- DUT
+  ------------------------------------------------------------------------------
+  gen_dut : for RN in 0 to c_last_rn generate
+    -- Connect ring wires between the nodes
+    wire_ring : if RN > 0 generate
+      tr_10gbe_ring_serial_rx_arr(RN) <= transport tr_10gbe_ring_serial_tx_arr(RN - 1) after c_cable_delay;
+    end generate;
+    close_ring : if RN = 0 generate
+      tr_10gbe_ring_serial_rx_arr(0) <= transport tr_10gbe_ring_serial_tx_arr(c_last_rn) after c_cable_delay;
+    end generate;
+
+    -- tr_10GbE access at each node, all via front_io QSFP[0]
+    u_tr_10GbE_ring: entity tr_10GbE_lib.tr_10GbE
+    generic map (
+      g_sim           => true,
+      g_sim_level     => 1,
+      g_nof_macs      => 1,
+      g_direction     => "TX_RX",
+      g_tx_fifo_fill  => c_fifo_tx_fill_ring,
+      g_tx_fifo_size  => c_fifo_tx_size_ring
+    )
+    port map (
+      -- Transceiver PLL reference clock
+      tr_ref_clk_644        => SA_CLK,
+      tr_ref_clk_312        => tr_ref_clk_312,
+      tr_ref_clk_156        => tr_ref_clk_156,
+      tr_ref_rst_156        => tr_ref_rst_156,
+
+      -- MM interface
+      mm_rst                => mm_rst,
+      mm_clk                => mm_clk,
+
+      reg_mac_mosi          => c_mem_copi_rst,
+      reg_mac_miso          => open,
+      reg_eth10g_mosi       => c_mem_copi_rst,
+      reg_eth10g_miso       => open,
+
+      -- DP interface
+      dp_rst                => dp_rst,
+      dp_clk                => dp_clk,
+
+      src_out_arr           => tr_10gbe_ring_rx_sosi_arr(RN downto RN),
+      snk_in_arr            => tr_10gbe_ring_tx_sosi_arr(RN downto RN),
+
+      -- Serial IO
+      serial_tx_arr         => tr_10gbe_ring_serial_tx_arr(RN downto RN),
+      serial_rx_arr         => tr_10gbe_ring_serial_rx_arr(RN downto RN)
+    );
+
+    -- Ring lane access at each node
+    u_ring_lane_xst : entity ring_lib.ring_lane
+      generic map (
+        g_lane_direction            => 1,  -- transport in positive RN direction.
+        g_lane_data_w               => c_longword_w,
+        g_lane_packet_length        => c_lane_payload_nof_longwords_xst,
+        g_lane_total_nof_packets_w  => 32,
+        g_use_dp_layer              => true,
+        g_nof_rx_monitors           => g_nof_rn,
+        g_nof_tx_monitors           => g_nof_rn,
+        g_err_bi                    => c_err_bi,
+        g_nof_err_counts            => c_nof_err_counts,
+        g_bsn_at_sync_check_channel => c_bsn_at_sync_check_channel,
+        g_validate_channel          => c_validate_channel,
+        g_validate_channel_mode     => c_validate_channel_mode,
+        g_sync_timeout              => c_sync_timeout
+      )
+      port map (
+        mm_rst => mm_rst,
+        mm_clk => mm_clk,
+        dp_clk => dp_clk,
+        dp_rst => dp_rst,
+
+        from_lane_sosi     => from_ri_sosi_arr(RN),
+        to_lane_sosi       => to_ri_sosi_arr(RN),
+        lane_rx_cable_sosi => tr_10gbe_ring_rx_sosi_arr(RN),
+        lane_rx_board_sosi => c_dp_sosi_rst,
+        lane_tx_cable_sosi => tr_10gbe_ring_tx_sosi_arr(RN),
+        lane_tx_board_sosi => open,
+        bs_sosi            => xst_bs_sosi,  -- used for bsn and sync
+
+        reg_ring_lane_info_copi                => reg_ring_lane_info_xst_copi_arr(RN),
+        reg_ring_lane_info_cipo                => reg_ring_lane_info_xst_cipo_arr(RN),
+        reg_bsn_monitor_v2_ring_rx_copi        => reg_bsn_monitor_v2_ring_rx_xst_copi_arr(RN),
+        reg_bsn_monitor_v2_ring_rx_cipo        => reg_bsn_monitor_v2_ring_rx_xst_cipo_arr(RN),
+        reg_bsn_monitor_v2_ring_tx_copi        => reg_bsn_monitor_v2_ring_tx_xst_copi_arr(RN),
+        reg_bsn_monitor_v2_ring_tx_cipo        => reg_bsn_monitor_v2_ring_tx_xst_cipo_arr(RN),
+        reg_dp_block_validate_err_copi         => reg_dp_block_validate_err_xst_copi_arr(RN),
+        reg_dp_block_validate_err_cipo         => reg_dp_block_validate_err_xst_cipo_arr(RN),
+        reg_dp_block_validate_bsn_at_sync_copi => reg_dp_block_validate_bsn_at_sync_xst_copi_arr(RN),
+        reg_dp_block_validate_bsn_at_sync_cipo => reg_dp_block_validate_bsn_at_sync_xst_cipo_arr(RN),
+
+        this_rn   => to_uvec(RN, c_byte_w),
+        N_rn      => to_uvec(g_nof_rn, c_byte_w),
+        rx_select => c_use_cable,
+        tx_select => c_use_cable
+      );
+
+    -- Intermediate crosslets alignment at each node
+    u_sdp_crosslets_remote : entity work.sdp_crosslets_remote_v2
+      generic map (
+        g_P_sq  => c_P_sq
+      )
+      port map (
+        dp_clk        => dp_clk,
+        dp_rst        => dp_rst,
+
+        xsel_sosi     => local_crosslets_sosi_arr(RN),
+        from_ri_sosi  => from_ri_sosi_arr(RN),
+        to_ri_sosi    => to_ri_sosi_arr(RN),
+
+        crosslets_sosi     => crosslets_sosi_arr(RN),
+        crosslets_copi     => crosslets_copi_arr(RN),
+        crosslets_cipo_arr => crosslets_cipo_2arr(RN),
+
+        mm_rst        => mm_rst,
+        mm_clk        => mm_clk,
+
+        reg_bsn_align_copi                       => reg_bsn_align_v2_xst_copi_arr(RN),
+        reg_bsn_align_cipo                       => reg_bsn_align_v2_xst_cipo_arr(RN),
+        reg_bsn_monitor_v2_bsn_align_input_copi  => reg_bsn_monitor_v2_xst_rx_align_copi_arr(RN),
+        reg_bsn_monitor_v2_bsn_align_input_cipo  => reg_bsn_monitor_v2_xst_rx_align_cipo_arr(RN),
+        reg_bsn_monitor_v2_bsn_align_output_copi => reg_bsn_monitor_v2_xst_aligned_copi_arr(RN),
+        reg_bsn_monitor_v2_bsn_align_output_cipo => reg_bsn_monitor_v2_xst_aligned_cipo_arr(RN)
+      );
+
+    -- MM -> DP
+    u_st_xsq_mm_to_dp : entity st_lib.st_xsq_mm_to_dp
+    generic map(
+      g_nof_streams       => c_P_sq,
+      g_nof_crosslets     => c_sdp_N_crosslets_max,
+      g_nof_signal_inputs => c_sdp_S_pn,
+      g_dsp_data_w        => c_sdp_W_crosslet
+    )
+    port map(
+      rst          => dp_rst,
+      clk          => dp_clk,
+      in_sosi      => crosslets_sosi_arr(RN),
+      mm_mosi      => crosslets_copi_arr(RN),
+      mm_miso_arr  => crosslets_cipo_2arr(RN),
+      out_sosi_arr => x_sosi_2arr(RN)
+    );
+  end generate;  -- gen_dut
+
+  -- View status of x_sosi_2arr
+  p_x_sosi_2arr : process(x_sosi_2arr)
+  begin
+    -- Aligned first output of the first RN
+    x_sosi <= x_sosi_2arr(0)(0);
+
+    for I in 0 to c_last_rn loop
+      -- Aligned first output of every RN
+      x_sosi_arr(I) <= x_sosi_2arr(I)(0);
+
+      -- Collect the valids of all c_P_sq outputs of this RN in one slv
+      for J in 0 to c_P_sq - 1 loop
+        x_sosi_2arr_valids(I * c_P_sq + J) <= x_sosi_2arr(I)(J).valid;
+      end loop;
+    end loop;
+  end process;
+
+  ------------------------------------------------------------------------------
+  -- Verify crosslets at every node, to check that no packets were lost
+  ------------------------------------------------------------------------------
+  p_verify_crosslets : process
+  begin
+    wait until rising_edge(dp_clk);  -- verify once per clock cycle, so not at time 0 and not at falling edges
+    -- Verify that data /= 0, so no lost data = 0 insertion
+    for RN in 0 to c_last_rn loop
+      for P in 0 to c_P_sq - 1 loop
+        if x_sosi_2arr(RN)(P).valid = '1' then
+          assert TO_SINT(x_sosi_2arr(RN)(P).re) /= 0 report "Wrong crosslet re at node " & int_to_str(RN) severity error;
+          assert TO_SINT(x_sosi_2arr(RN)(P).im) /= 0 report "Wrong crosslet im at node " & int_to_str(RN) severity error;
+        end if;
+      end loop;
+    end loop;
+    -- Verify that all aligned outputs on all RN are valid at the same time, using first output of first RN as reference
+    if x_sosi.valid = '1' then
+      assert vector_and(x_sosi_2arr_valids) = '1' report "Missing aligned output valid" severity error;
+    else
+      assert vector_and(x_sosi_2arr_valids) = '0' report "Unexpected aligned output valid" severity error;
+    end if;
+  end process;
+
+  ------------------------------------------------------------------------------
+  -- 10GbE clocks
+  ------------------------------------------------------------------------------
+  u_tech_pll_xgmii_mac_clocks : entity tech_pll_lib.tech_pll_xgmii_mac_clocks
+  port map (
+    refclk_644 => SA_CLK,  -- same reference clock as tr_ref_clk_644 of u_tr_10GbE_ring
+    rst_in     => mm_rst,
+    clk_156    => tr_ref_clk_156,  -- one set of 10GbE clocks, shared by u_tr_10GbE_ring on all RN
+    clk_312    => tr_ref_clk_312,
+    rst_156    => tr_ref_rst_156,
+    rst_312    => open  -- not used in this tb
+  );
+
+  ------------------------------------------------------------------------------
+  -- MM bus multiplexers
+  ------------------------------------------------------------------------------
+  -- Use common_mem_mux to avoid (vcom-1450) Actual (indexed name) for formal "mm_miso" is not a static signal name.
+  -- Use downto range for _arr, to match downto range of mosi_arr.
+  u_mem_mux_reg_ring_lane_info_xst : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,  -- one MM port per ring node RN
+    g_mult_addr_w => c_addr_w_reg_ring_lane_info_xst
+  )
+  port map (
+    mosi     => reg_ring_lane_info_xst_copi,
+    miso     => reg_ring_lane_info_xst_cipo,
+    mosi_arr => reg_ring_lane_info_xst_copi_arr,
+    miso_arr => reg_ring_lane_info_xst_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_monitor_v2_ring_rx_xst : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_monitor_v2_addr_w + ceil_log2(g_nof_rn)  -- g_nof_rx_monitors = g_nof_rn per RN
+  )
+  port map (
+    mosi     => reg_bsn_monitor_v2_ring_rx_xst_copi,
+    miso     => reg_bsn_monitor_v2_ring_rx_xst_cipo,
+    mosi_arr => reg_bsn_monitor_v2_ring_rx_xst_copi_arr,
+    miso_arr => reg_bsn_monitor_v2_ring_rx_xst_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_monitor_v2_ring_tx_xst : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_monitor_v2_addr_w + ceil_log2(g_nof_rn)  -- g_nof_tx_monitors = g_nof_rn per RN
+  )
+  port map (
+    mosi     => reg_bsn_monitor_v2_ring_tx_xst_copi,
+    miso     => reg_bsn_monitor_v2_ring_tx_xst_cipo,
+    mosi_arr => reg_bsn_monitor_v2_ring_tx_xst_copi_arr,
+    miso_arr => reg_bsn_monitor_v2_ring_tx_xst_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_monitor_v2_xst_rx_align : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_monitor_v2_addr_w + ceil_log2(c_P_sq)  -- presumably c_P_sq monitors per RN, TODO confirm
+  )
+  port map (
+    mosi     => reg_bsn_monitor_v2_xst_rx_align_copi,
+    miso     => reg_bsn_monitor_v2_xst_rx_align_cipo,
+    mosi_arr => reg_bsn_monitor_v2_xst_rx_align_copi_arr,
+    miso_arr => reg_bsn_monitor_v2_xst_rx_align_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_monitor_v2_xst_aligned : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_monitor_v2_addr_w  -- presumably one monitor per RN, TODO confirm
+  )
+  port map (
+    mosi     => reg_bsn_monitor_v2_xst_aligned_copi,
+    miso     => reg_bsn_monitor_v2_xst_aligned_cipo,
+    mosi_arr => reg_bsn_monitor_v2_xst_aligned_copi_arr,
+    miso_arr => reg_bsn_monitor_v2_xst_aligned_cipo_arr
+  );
+
+  u_mem_mux_reg_bsn_align_v2_xst : entity common_lib.common_mem_mux
+  generic map (
+    g_nof_mosi    => g_nof_rn,
+    g_mult_addr_w => c_sdp_reg_bsn_align_v2_addr_w  -- span of reg_bsn_align_copi of sdp_crosslets_remote_v2 per RN
+  )
+  port map (
+    mosi     => reg_bsn_align_v2_xst_copi,
+    miso     => reg_bsn_align_v2_xst_cipo,
+    mosi_arr => reg_bsn_align_v2_xst_copi_arr,
+    miso_arr => reg_bsn_align_v2_xst_cipo_arr
+  );
+end tb;
diff --git a/libraries/base/common/src/vhdl/common_str_pkg.vhd b/libraries/base/common/src/vhdl/common_str_pkg.vhd
index e309a4d10a00d1432c3c166ae1b0bc0174f251c9..8aad8788922126a200c54d4bbfc735cb55ef384f 100644
--- a/libraries/base/common/src/vhdl/common_str_pkg.vhd
+++ b/libraries/base/common/src/vhdl/common_str_pkg.vhd
@@ -53,6 +53,7 @@ package common_str_pkg is
   function hex_nibble_to_slv(c: character) return std_logic_vector;
 
   function int_to_str(int: integer) return string;
+  function int_to_str(int, w: integer) return string;  -- int right aligned in string of w characters
   function real_to_str(re: real; width : integer; digits : integer) return string;
 
   procedure print_str(str : string);
@@ -222,10 +223,9 @@ package body common_str_pkg is
       when 'X' => v_result :=  "XXXX";
       when 'z' => v_result :=  "ZZZZ";
       when 'Z' => v_result :=  "ZZZZ";
-
-	    when others => v_result := "0000";
-     end case;
-   return v_result;
+	  when others => v_result := "0000";
+    end case;
+    return v_result;
   end hex_nibble_to_slv;
 
   function int_to_str(int: integer) return string is
@@ -238,6 +238,24 @@ package body common_str_pkg is
     return v_str;
   end;
 
+  function int_to_str(int, w: integer) return string is
+    -- Right align int in a string of w characters. If w is too small to fit
+    -- int, then use more characters, similar to real_to_str().
+    constant c_len: natural := nof_digits_int(int);
+    variable v_line: LINE;
+    variable v_str: string(1 to c_len) := (others => ' ');
+    variable v_ret: string(1 to w) := (others => ' ');
+  begin
+    STD.TEXTIO.WRITE(v_line, int);
+    v_str(v_line.ALL'range) := v_line.all;
+    deallocate(v_line);
+    if w < c_len then
+      return v_str;  -- w is too small, the slice in v_ret would be out of range
+    end if;
+    v_ret(w - c_len + 1 to w) := v_str;  -- right align v_str in v_ret
+    return v_ret;
+  end;
+
   function real_to_str(re: real; width : integer; digits : integer) return string is
     -- . The number length is width + 1, with +1 for the . in the floating point number.
     --   However if width is too small to fit the number, then it will use more characters.
diff --git a/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd b/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
index 9042f050f7797b53fec3194857d70a0b6aac009c..61db3174e6f4f846ff445cecef3a07421177f3b2 100644
--- a/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
@@ -20,14 +20,43 @@
 -- Purpose :
 --   Align frames from multiple input streams
 -- Description:
---   The aligner uses a circular buffer to capture the blocks that arrive at
---   the input streams. The blocks have a block sequence number (BSN) that
---   is used to align the inputs. The input stream 0 is treated as local
---   input stream that is ahead of the other remote input streams. After a
---   certain number of blocks on input 0, the same block on all remote
---   inputs should also have arrived. If not then they are replaced by
---   replacement data. The output streams are paced by the block rate of
---   input 0. The user has to read the block within the block period.
+--   Aligner:
+--   . The aligner uses a circular buffer to capture the blocks that arrive at
+--     the input streams. The blocks have a block sequence number (BSN) that
+--     is used to align the inputs. The input stream 0 is treated as local
+--     input stream that is ahead of the other remote input streams. After a
+--     certain number of blocks on input 0, the same block on all remote
+--     inputs should also have arrived. If not then they are replaced by
+--     replacement data. The output streams are paced by the block rate of
+--     input 0. The user has to read the block within the block period.
+--   . The aligner can align g_nof_streams that all arrive within a latency
+--     of g_bsn_latency_max after the local stream at index 0. The aligner
+--     can also be used in a chain of aligners, whereby each aligner typically
+--     has the local input and one remote input and the remote input is the
+--     output of an upstream aligner. Then the latency on the last node in
+--     the chain will be within g_nof_aligners_max * g_bsn_latency_max.
+--
+--   Circular buffer:
+--   . The size of the circular buffer is c_buffer_nof_blocks and depends on
+--     the maximum latency. The c_buffer_nof_blocks has to be a power of two to
+--     ease the control of the circular buffer. The lowest bits of the input
+--     block sequence number (BSN) are used as write block index into the
+--     circular buffer. The g_bsn_latency_first_node can be useful to reduce
+--     the required circular buffer size just enough, such that the next power
+--     of two is only a few blocks larger, instead of almost a factor two
+--     larger. This then can save a significant amount of block RAM.
+--     For example: The circular buffer size c_buffer_nof_blocks is 1 + the
+--     sum of bsn latencies at each node. Therefore if g_nof_aligners_max = 16
+--     (a power of two) and g_bsn_latency_max = 2, then the circular buffer
+--     becomes true_log_pow2(1 + 16 * 2) = 64 blocks, so almost twice as large
+--     as needed. If the first input stream does not have active remote input,
+--     or is disabled via stream_en_arr, then choose g_bsn_latency_first_node
+--     = 1, to get a buffer size of true_log_pow2(1 + 15 * 2 + 1) = 32 blocks.
+--   . In case of a chain of aligners then the circular buffer size depends on
+--     the latency of local input. The most remote input will only use a
+--     fraction of the buffer. Therefore more block RAM can be saved by using
+--     a smaller circular buffer size for signal inputs that are more remote
+--     (i.e. that have passed through more upstream aligners).
 --
 --   Features:
 --   . The g_block_size <= block period, so supports input blocks arriving
@@ -35,8 +64,44 @@
 --   . uses replacement data to replace lost input blocks and channel bit 0 as
 --     lost_data flag
 --   . uses replacement data to replace disabled input streams
---   . output block can be read in arbitrary order via g_use_mm_output = TRUE
---   . output block can be streamed via g_use_mm_output = FALSE
+--   . output block can be read in arbitrary order via g_use_mm_output = true
+--   . output block can be streamed via g_use_mm_output = false
+--
+--   Parameters:
+--   . g_nof_streams: number of input and output streams. Stream index 0 is
+--     the local stream. Stream indices > 0 are for remote streams. The
+--     remote streams arrive later than the local stream, but within
+--     g_bsn_latency_max or within an integer multiple of g_bsn_latency_max.
+--   . g_bsn_latency_max: >= 1, maximum travel latency of a remote block in
+--     number of block periods T_blk.
+--   . g_bsn_latency_first_node: typically <= g_bsn_latency_max of the other
+--     nodes in a chain. Use g_bsn_latency_first_node = 0 for immediate
+--     output from first node in a chain of nodes. Only used when
+--     g_nof_aligners_max > 1. The g_bsn_latency_first_node setting only
+--     affects the latency along the chain, and therefore the required
+--     size of the circular buffer. If the circular buffer is large enough
+--     anyway, then the g_bsn_latency_first_node setting is don't care,
+--     assuming that a little extra latency is don't care.
+--   . g_nof_aligners_max: Number of dp_bsn_align_v2 aligners in a chain.
+--     = 1 when only align at last node, or
+--     > 1 when align at every intermediate node in a chain of nodes, and then
+--         g_nof_aligners_max should equal the number of nodes for
+--         chain_node_index range. The g_nof_aligners_max is the number of
+--         nodes in the chain including the first node.
+--
+--   Inputs:
+--   . chain_node_index: Node index in chain of nodes. First node has index 0.
+--     In case of a ring of nodes the chain of nodes can range the whole ring,
+--     or only a part of the ring. The number of nodes in the chain is given
+--     by g_nof_aligners_max. Only used when g_nof_aligners_max > 1.
+--   . stream_en_arr: when '1' then align corresponding input stream, else
+--     replace data from corresponding input stream by 0 and do not raise the
+--     lost data flag. Whether a stream is enabled or not has no effect on the
+--     aligner timing, it only sets the data to 0.
+--
+--   Outputs:
+--   . replace_cnt_en_arr: count number of lost data blocks per input stream,
+--     that got replaced by 0 value, per sync interval.
 --
 --   For more detailed description see:
 --   https://support.astron.nl/confluence/display/L2M/L6+FWLIB+Design+Document%3A+BSN+aligner+v2
@@ -46,6 +111,13 @@
 --   APERTIF. Main differences are that the old component uses FIFO buffers,
 --   timeouts and states, and v2 does not, which makes v2 simpler and more
 --   robust.
+-- . The g_bsn_latency_first_node = 0 should also be feasible, but does not
+--   work and is not investigated further, because g_bsn_latency_first_node =
+--   1 in combination with g_bsn_latency_max = 2 is sufficient to reduce the
+--   circular buffer size when g_nof_aligners_max is a power of two.
+-- . Using a circular buffer with optimum size, that does not have to have a
+--   power of two number of blocks, makes the circular buffer control and
+--   access more complicated and is not investigated further.
 
 library IEEE,common_lib;
 use IEEE.std_logic_1164.all;
@@ -57,22 +129,23 @@ use work.dp_stream_pkg.all;
 entity dp_bsn_align_v2 is
   generic (
     g_nof_streams                : natural := 2;  -- >= 2, number of input and output streams
-    g_bsn_latency_max            : natural := 2;  -- maximum travel latency of a remote block in number of block periods T_blk
-    g_nof_aligners_max           : positive := 16;  -- 1 when only align at last node, > 1 when align at every intermediate node
+    g_bsn_latency_max            : natural := 2;  -- >= 1
+    g_bsn_latency_first_node     : natural := 2;  -- default use same as g_bsn_latency_max
+    g_nof_aligners_max           : positive := 16;
     g_block_size                 : natural := 1024;  -- > 1, g_block_size=1 is not supported
     g_bsn_w                      : natural := c_dp_stream_bsn_w;  -- number of bits in sosi BSN
     g_data_w                     : natural := 36;  -- number of bits in sosi data
     g_data_replacement_value     : integer := 0;  -- output sosi data value for missing input blocks
     g_use_mm_output              : boolean := false;  -- output via MM or via streaming DP
-    g_pipeline_input             : natural := 1;  -- >= 0, choose 0 for wires, choose 1 to ease timing closure of in_sosi_arr
-    g_pipeline_output            : natural := 1;  -- >= 0, choose 0 for wires, choose 1 to ease timing closure of out_sosi_arr
+    g_pipeline_input             : natural := 1;  -- >= 0, 0 for wires, 1 to ease timing closure of in_sosi_arr
+    g_pipeline_output            : natural := 1;  -- >= 0, 0 for wires, 1 to ease timing closure of out_sosi_arr
     g_rd_latency                 : natural := 2  -- 1 or 2, choose 2 to ease timing closure
   );
   port (
     dp_rst         : in  std_logic;
     dp_clk         : in  std_logic;
 
-    node_index     : in  natural range 0 to g_nof_aligners_max - 1 := 0;  -- only used when g_nof_aligners_max > 1
+    chain_node_index : in  natural range 0 to g_nof_aligners_max - 1 := 0;
 
     -- MM control
     stream_en_arr            : in  std_logic_vector(g_nof_streams - 1 downto 0) := (others => '1');
@@ -81,19 +154,23 @@ entity dp_bsn_align_v2 is
     -- Streaming input
     in_sosi_arr    : in  t_dp_sosi_arr(g_nof_streams - 1 downto 0);
 
-    -- Output via local MM interface in dp_clk domain, when g_use_mm_output = TRUE.
-    mm_sosi        : out t_dp_sosi;  -- streaming information that signals that an output block can be read
-    mm_copi        : in  t_mem_copi := c_mem_copi_rst;  -- read access to output block, all output streams share same mm_copi
+    -- Output via local MM interface in dp_clk domain, when g_use_mm_output = true
+    -- . streaming information that signals that an output block can be read
+    mm_sosi        : out t_dp_sosi;
+    -- . MM read access to output block, all output streams share same mm_copi
+    mm_copi        : in  t_mem_copi := c_mem_copi_rst;
     mm_cipo_arr    : out t_mem_cipo_arr(g_nof_streams - 1 downto 0);
 
-    -- Output via streaming DP interface, when g_use_mm_output = FALSE.
+    -- Output via streaming DP interface, when g_use_mm_output = false.
     out_sosi_arr   : out t_dp_sosi_arr(g_nof_streams - 1 downto 0)
   );
 end dp_bsn_align_v2;
 
 architecture rtl of dp_bsn_align_v2 is
-  -- Circular buffer per stream, size is next power of 2 that fits
-  constant c_buffer_nof_blocks : natural :=  true_log_pow2(1 + g_nof_aligners_max * g_bsn_latency_max);
+  -- Circular buffer per stream, size is next power of two that fits
+  constant c_buffer_nof_blocks : natural := sel_a_b(g_nof_aligners_max = 1,
+           true_log_pow2(1 + g_bsn_latency_max),
+           true_log_pow2(1 + g_bsn_latency_max * (g_nof_aligners_max - 1) + g_bsn_latency_first_node));
 
   constant c_ram_size       : natural := c_buffer_nof_blocks * g_block_size;
   constant c_ram_buf        : t_c_mem := (latency  => 1,
@@ -102,7 +179,7 @@ architecture rtl of dp_bsn_align_v2 is
                                           nof_dat  => c_ram_size,
                                           init_sl  => '0');
 
-  -- Use +1 to ensure that g_block_size that is power of 2 also fits in c_block_size_slv
+  -- Use +1 to ensure that g_block_size that is power of two also fits in c_block_size_slv
   constant c_block_size_w   : natural := ceil_log2(g_block_size + 1);
   constant c_block_size_slv : std_logic_vector(c_block_size_w - 1 downto 0) := TO_UVEC(g_block_size, c_block_size_w);
   constant c_blk_pointer_w  : natural := ceil_log2(c_buffer_nof_blocks);
@@ -121,6 +198,7 @@ architecture rtl of dp_bsn_align_v2 is
 
   -- State
   type t_reg is record
+    ref_sosi             : t_dp_sosi;
     -- p_write_arr
     wr_blk_pointer       : natural;
     wr_copi_arr          : t_mem_copi_arr(g_nof_streams - 1 downto 0);
@@ -136,9 +214,12 @@ architecture rtl of dp_bsn_align_v2 is
     rd_blk_pointer       : integer;  -- use integer to detect need to wrap to natural
     rd_offset            : std_logic_vector(c_ram_buf.adr_w - 1 downto 0);
     rd_copi              : t_mem_copi;
-    fill_cipo_arr        : t_mem_cipo_arr(g_nof_streams - 1 downto 0);  -- used combinatorial to contain rd_cipo_arr from buffer or replacement data
-    out_bsn              : std_logic_vector(g_bsn_w - 1 downto 0);  -- hold BSN until next sop, for easy view in Wave window
-    out_channel_arr      : t_channel_arr(g_nof_streams - 1 downto 0);  -- hold channel until next sop per stream, for easy view in Wave window
+    fill_cipo_arr        : t_mem_cipo_arr(g_nof_streams - 1 downto 0);  -- used combinatorial to contain rd_cipo_arr
+                                                                        -- from buffer or replacement data
+    out_bsn              : std_logic_vector(g_bsn_w - 1 downto 0);  -- hold BSN until next sop, for easy view in Wave
+                                                                    -- window
+    out_channel_arr      : t_channel_arr(g_nof_streams - 1 downto 0);  -- hold channel until next sop per stream, for
+                                                                       -- easy view in Wave window
     replace_cnt_en_arr   : std_logic_vector(g_nof_streams - 1 downto 0);
   end record;
 
@@ -146,16 +227,16 @@ architecture rtl of dp_bsn_align_v2 is
   -- . For unique representation as signal wire, the p_comb should assign each
   --   field in t_comb only once to a variable. It is allowed to reasign a
   --   t_comb variable in p_comb, but then only the last assignment value will
-  --   be visible via the signal dbg_wires in the Wave window.
+  --   be visible via the signal w_comb in the Wave window.
   type t_comb is record
-    ref_sosi            : t_dp_sosi;
     blk_pointer_slv     : std_logic_vector(c_blk_pointer_w - 1 downto 0);
     product_slv         : std_logic_vector(c_product_w - 1 downto 0);
     lost_data_flags_arr : std_logic_vector(g_nof_streams - 1 downto 0);
     out_sosi_arr        : t_dp_sosi_arr(g_nof_streams - 1 downto 0);
   end record;
 
-  constant c_reg_rst  : t_reg := (0,
+  constant c_reg_rst  : t_reg := (c_dp_sosi_rst,
+                                  0,
                                   (others => c_mem_copi_rst),
                                   (others => (others => '0')),
                                   (others => '0'),
@@ -171,18 +252,17 @@ architecture rtl of dp_bsn_align_v2 is
                                   (others => (others => '0')),
                                   (others => '0'));
 
-  constant c_comb_rst  : t_comb := (c_dp_sosi_rst,
-                                   (others => '0'),
-                                   (others => '0'),
-                                   (others => '0'),
-                                   (others => c_dp_sosi_rst));
+  constant c_comb_rst  : t_comb := ((others => '0'),
+                                    (others => '0'),
+                                    (others => '0'),
+                                    (others => c_dp_sosi_rst));
 
   -- State registers for p_comb
   signal r                 : t_reg;
   signal nxt_r             : t_reg;
 
   -- Memoryless signals in p_comb (wires used as local variables)
-  signal dbg_wires         : t_comb;
+  signal w_comb            : t_comb;
 
   -- Structural signals (wires used to connect components and IO)
   signal dp_done           : std_logic;
@@ -199,7 +279,6 @@ architecture rtl of dp_bsn_align_v2 is
   signal comb_out_sosi_arr : t_dp_sosi_arr(g_nof_streams - 1 downto 0);
 
   -- Counter signals
-
   signal replace_cnt_arr          : t_slv_32_arr(g_nof_streams - 1 downto 0);
   signal nxt_hold_replace_cnt_arr : t_slv_32_arr(g_nof_streams - 1 downto 0);
   signal hold_replace_cnt_arr     : t_slv_32_arr(g_nof_streams - 1 downto 0);
@@ -230,7 +309,7 @@ begin
     end if;
   end process;
 
-  p_comb : process(r, in_sosi_arr_p, mm_copi, dp_copi, rd_cipo_arr, rd_sosi_arr, stream_en_arr, node_index)
+  p_comb : process(r, in_sosi_arr_p, mm_copi, dp_copi, rd_cipo_arr, rd_sosi_arr, stream_en_arr, chain_node_index)
     variable v : t_reg;  -- State variable
     variable w : t_comb;  -- Local wires = memoryless auxiliary variables
   begin
@@ -264,23 +343,32 @@ begin
       end if;
     end loop;
 
-    ----------------------------------------------------------------------------
+    ---------------------------------------------------------------------------
     -- p_control, all at sop of local reference input 0
-    ----------------------------------------------------------------------------
-    w.ref_sosi := in_sosi_arr_p(0);
-    if w.ref_sosi.sop = '1' then
+    ---------------------------------------------------------------------------
+    v.ref_sosi := in_sosi_arr_p(0);
+    -- Use v.ref_sosi.sop instead of r.ref_sosi.sop, to support alignment of
+    -- streams that have no data valid gap between blocks, so when
+    -- g_block_size is equal to the block period or when shorter blocks have
+    -- jitter in arrival time that could cause two blocks to arrive without a
+    -- gap.
+    if v.ref_sosi.sop = '1' then
       -- . write sync & bsn buffer
-      v.wr_blk_pointer := TO_UINT(w.ref_sosi.bsn(c_blk_pointer_w - 1 downto 0));
-      v.sync_arr(v.wr_blk_pointer) := w.ref_sosi.sync;
-      v.bsn_arr(v.wr_blk_pointer) := w.ref_sosi.bsn(g_bsn_w - 1 downto 0);
+      v.wr_blk_pointer := TO_UINT(v.ref_sosi.bsn(c_blk_pointer_w - 1 downto 0));
+      v.sync_arr(v.wr_blk_pointer) := v.ref_sosi.sync;
+      v.bsn_arr(v.wr_blk_pointer) := v.ref_sosi.bsn(g_bsn_w - 1 downto 0);
 
       -- . update read block pointer at g_bsn_latency_max blocks behind the
-      --   reference write pointer, dependent on the node_index. For
-      --   g_bsn_latency_max = 1 the node_index = 0 fixed. For
-      --   g_bsn_latency_max > 1, node_index is the first BSN aligner in a
-      --   chain. Each subsequent node in the chain then has to account for
-      --   g_bsn_latency_max additional block latency.
-      v.rd_blk_pointer := v.wr_blk_pointer - g_bsn_latency_max * (1 + node_index);
+      --   reference write pointer, dependent on the chain_node_index:
+      --   - for g_nof_aligners_max = 1 the chain_node_index = 0 fixed
+      --   - for g_nof_aligners_max > 1, chain_node_index = 0 is the first BSN
+      --     aligner in a chain. Each subsequent node in the chain then has to
+      --     account for g_bsn_latency_max additional block latency.
+      if g_nof_aligners_max = 1 then
+        v.rd_blk_pointer := v.wr_blk_pointer - g_bsn_latency_max;
+      else
+        v.rd_blk_pointer := v.wr_blk_pointer - g_bsn_latency_max * chain_node_index - g_bsn_latency_first_node;
+      end if;
       if v.rd_blk_pointer < 0 then
         v.rd_blk_pointer := v.rd_blk_pointer + c_buffer_nof_blocks;
       end if;
@@ -291,6 +379,10 @@ begin
       v.rd_offset := RESIZE_UVEC(w.product_slv, c_ram_buf.adr_w);
 
       -- . issue mm_sosi, if there is output ready to be read, indicated by filled reference block
+      --   - can use 'if r.filled_arr(0)' instead of 'if v.filled_arr(0)',
+      --     because input stream 0 arrives first, so is already filled
+      --   - need to use 'not v.filled_arr(I)' for w.lost_data_flags_arr(I),
+      --     because last input I = g_nof_streams - 1 may have just got filled.
       if r.filled_arr(0)(v.rd_blk_pointer) = '1' then
         v.mm_sosi.sop := '1';
         v.mm_sosi.eop := '1';
@@ -339,7 +431,7 @@ begin
       -- Do the output via the MM interface
       --------------------------------------------------------------------------
       -- . adjust the rd address to the current buffer output block
-      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVECdetermines width
+      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVEC determines width
       v.rd_copi := mm_copi;
       v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, mm_copi.address));
 
@@ -353,7 +445,7 @@ begin
       -- Do the output via the DP streaming interface
       --------------------------------------------------------------------------
       -- . adjust the rd address
-      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVECdetermines width
+      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVEC determines width
       v.rd_copi := dp_copi;
       v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, dp_copi.address));
 
@@ -401,7 +493,7 @@ begin
     nxt_r <= v;
 
     -- local wires, only for view in wave window
-    dbg_wires <= w;
+    w_comb <= w;
   end process;
 
   ------------------------------------------------------------------------------
diff --git a/libraries/base/dp/src/vhdl/mmp_dp_bsn_align_v2.vhd b/libraries/base/dp/src/vhdl/mmp_dp_bsn_align_v2.vhd
index 20456ab387ba3873e6a87d8843bafd78250ac876..d7803fb866b75af0a44f18595e8934a5db1b11b5 100644
--- a/libraries/base/dp/src/vhdl/mmp_dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/src/vhdl/mmp_dp_bsn_align_v2.vhd
@@ -44,16 +44,18 @@ use work.dp_stream_pkg.all;
 entity mmp_dp_bsn_align_v2 is
   generic (
     -- for dp_bsn_align_v2
-    g_nof_streams                : natural;  -- number of input and output streams
-    g_bsn_latency_max            : natural;  -- Maximum travel latency of a remote block in number of block periods T_blk
-    g_nof_aligners_max           : natural := 1;  -- 1 when only align at last node, > 1 when align at every intermediate node
+    g_nof_streams                : natural := 2;  -- number of input and output streams
+    g_bsn_latency_max            : natural := 2;  -- Maximum travel latency of a remote block in number of block periods
+    g_bsn_latency_first_node     : natural := 2;  -- default use same as g_bsn_latency_max
+    g_nof_aligners_max           : natural := 1;  -- 1 when only align at last node,
+                                                  -- > 1 when align at every intermediate node
     g_block_size                 : natural := 32;  -- > 1, g_block_size=1 is not supported
     g_bsn_w                      : natural := c_dp_stream_bsn_w;  -- number of bits in sosi BSN
     g_data_w                     : natural;  -- number of bits in sosi data
     g_data_replacement_value     : integer := 0;  -- output sosi data value for missing input blocks
     g_use_mm_output              : boolean := false;  -- output via MM or via streaming DP
-    g_pipeline_input             : natural := 1;  -- >= 0, choose 0 for wires, choose 1 to ease timing closure of in_sosi_arr
-    g_pipeline_output            : natural := 1;  -- >= 0, choose 0 for wires, choose 1 to ease timing closure of out_sosi_arr
+    g_pipeline_input             : natural := 1;  -- >= 0, 0 for wires, 1 to ease timing closure of in_sosi_arr
+    g_pipeline_output            : natural := 1;  -- >= 0, 0 for wires, 1 to ease timing closure of out_sosi_arr
     g_rd_latency                 : natural := 2;  -- 1 or 2, choose 2 to ease timing closure
     -- for mms_dp_bsn_monitor_v2
     g_nof_clk_per_sync           : natural := 200 * 10**6;
@@ -78,17 +80,17 @@ entity mmp_dp_bsn_align_v2 is
     dp_rst                  : in  std_logic;
     dp_clk                  : in  std_logic;
 
-    node_index              : in  natural range 0 to g_nof_aligners_max - 1 := 0;  -- only used when g_nof_aligners_max > 1
+    chain_node_index       : in  natural range 0 to g_nof_aligners_max - 1 := 0;  -- only used when g_nof_aligners_max > 1
 
     -- Streaming input
     in_sosi_arr             : in  t_dp_sosi_arr(g_nof_streams - 1 downto 0);
 
-    -- Output via local MM interface in dp_clk domain, when g_use_mm_output = TRUE.
+    -- Output via local MM interface in dp_clk domain, when g_use_mm_output = true.
     mm_sosi                 : out t_dp_sosi;  -- streaming information that signals that an output block can be read
     mm_copi                 : in  t_mem_copi := c_mem_copi_rst;  -- read access to output block, all output streams share same mm_copi
     mm_cipo_arr             : out t_mem_cipo_arr(g_nof_streams - 1 downto 0);
 
-    -- Output via streaming DP interface, when g_use_mm_output = FALSE.
+    -- Output via streaming DP interface, when g_use_mm_output = false.
     out_sosi_arr            : out t_dp_sosi_arr(g_nof_streams - 1 downto 0)
   );
 end mmp_dp_bsn_align_v2;
@@ -220,6 +222,7 @@ begin
   generic map (
     g_nof_streams                => g_nof_streams,
     g_bsn_latency_max            => g_bsn_latency_max,
+    g_bsn_latency_first_node     => g_bsn_latency_first_node,
     g_nof_aligners_max           => g_nof_aligners_max,
     g_block_size                 => g_block_size,
     g_bsn_w                      => g_bsn_w,
@@ -233,7 +236,7 @@ begin
   port map (
     dp_rst                  => dp_rst,
     dp_clk                  => dp_clk,
-    node_index              => node_index,
+    chain_node_index        => chain_node_index,
     -- MM control
     stream_en_arr           => stream_en_arr,
     stream_replaced_cnt_arr => stream_replaced_cnt_arr,
diff --git a/libraries/base/dp/tb/vhdl/tb_dp_bsn_align_v2.vhd b/libraries/base/dp/tb/vhdl/tb_dp_bsn_align_v2.vhd
index c6db289f9ed18f783f7200c0011a250f934233b7..7d5f7e3d695acabc3aa4042abfabf20d51b17ab4 100644
--- a/libraries/base/dp/tb/vhdl/tb_dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/tb/vhdl/tb_dp_bsn_align_v2.vhd
@@ -34,7 +34,7 @@
 --   . g_lost_bsn_id to loose a single block in stream 1 and verify that
 --     it gets replaced and flagged.
 --   . array of one or more BSN aligners via g_nof_aligners_max >= 1,
---     using node_index_arr, only support tb for g_use_mm_output = FALSE
+--     using chain_node_index_arr, only support tb for g_use_mm_output = false
 -- Remark:
 --   For this BSN aligner component it was essential to have an almost
 --   complete, reviewed, detailed design document, because it is a complex
@@ -85,8 +85,9 @@ entity tb_dp_bsn_align_v2 is
   generic (
     -- DUT
     g_nof_streams            : natural := 2;  -- number of input and output streams
-    g_bsn_latency_max        : natural := 1;  -- Maximum travel latency of a remote block in number of block periods T_blk
-    g_nof_aligners_max       : positive := 1;  -- 1 when only align at last node, > 1 when align at every intermediate node
+    g_bsn_latency_max        : natural := 2;  -- Maximum travel latency of a remote block in number of block periods
+    g_bsn_latency_first_node : natural := 1;
+    g_nof_aligners_max       : positive := 8;  -- 1 when only align at last node, > 1 when align at every intermediate node
     g_block_size             : natural := 11;  -- > 1, g_block_size=1 is not supported
     g_block_period           : natural := 20;  -- >= g_block_size, = g_block_size + c_gap_size
     g_bsn_w                  : natural := c_dp_stream_bsn_w;  -- number of bits in sosi BSN
@@ -105,7 +106,8 @@ entity tb_dp_bsn_align_v2 is
     g_tb_diff_delay          : integer := 0;  -- 0 = aligned inputs, -1 = max input delay for no loss,
                                                    -- >~ g_bsn_latency_max * g_block_period will give loss
     g_tb_nof_restart         : natural := 2;  -- number of times to restart the input stimuli
-    g_tb_nof_blocks          : natural := 20  -- number of input blocks per restart
+    g_tb_nof_blocks          : natural := 30  -- number of input blocks per restart, choose > circular buffer size, so
+                                              -- > c_align_latency_nof_blocks
   );
 end tb_dp_bsn_align_v2;
 
@@ -139,7 +141,11 @@ architecture tb of tb_dp_bsn_align_v2 is
 
   constant c_gap_size                 : natural := g_block_period - g_block_size;
 
-  constant c_lost_bsn_stream_id       : natural := sel_a_b(g_nof_streams > 1, 1, 0);  -- fixed use stream 1 to verify g_lost_bsn_id. Use 0 for g_nof_streams = 1.
+  -- Fixed use stream 1 to verify g_lost_bsn_id. Use 0 for g_nof_streams = 1.
+  constant c_lost_bsn_stream_id       : natural := sel_a_b(g_nof_streams > 1, 1, 0);
+
+  -- The tb does not (yet) support immediate aligned output at the first node when c_nof_aligners_max > 1
+  constant c_use_aligner_at_first_node : boolean := true;
 
   -- In the tb only support MM interface verification for c_nof_aligners_max = 1
   constant c_nof_aligners_max  : positive := sel_a_b(g_use_mm_output, 1, g_nof_aligners_max);
@@ -151,9 +157,11 @@ architecture tb of tb_dp_bsn_align_v2 is
   constant c_dut_latency              : natural := g_pipeline_input + g_rd_latency + c_mm_to_dp_latency + g_pipeline_output;
 
   -- DUT buffer latency for chain of DUTs
-  constant c_align_latency_nof_blocks : natural := g_bsn_latency_max * c_nof_aligners_max;  -- in number blocks
-  constant c_align_latency_nof_valid  : natural := g_bsn_latency_max * c_nof_aligners_max * g_block_size;  -- in number of data samples
-  constant c_align_latency_nof_clk    : natural := g_bsn_latency_max * c_nof_aligners_max * g_block_period;  -- in number clk cycles
+  constant c_align_latency_nof_blocks : natural := sel_a_b(c_nof_aligners_max = 1,
+               g_bsn_latency_max,
+               g_bsn_latency_max * (c_nof_aligners_max - 1) + g_bsn_latency_first_node);  -- number blocks
+  constant c_align_latency_nof_valid  : natural := c_align_latency_nof_blocks * g_block_size;  -- number of data samples
+  constant c_align_latency_nof_clk    : natural := c_align_latency_nof_blocks * g_block_period;  -- number clk cycles
 
   -- Total DUT chain latency
   constant c_total_latency            : natural := c_dut_latency + c_align_latency_nof_clk;
@@ -181,7 +189,7 @@ architecture tb of tb_dp_bsn_align_v2 is
   signal rst                   : std_logic := '1';
   signal sl1                   : std_logic := '1';
 
-  signal node_index_arr        : t_nat_natural_arr(0 to c_nof_aligners_max - 1) := array_init(0, c_nof_aligners_max, 1);
+  signal chain_node_index_arr  : t_nat_natural_arr(0 to c_nof_aligners_max - 1) := array_init(0, c_nof_aligners_max, 1);
 
   signal stream_en_arr         : std_logic_vector(g_nof_streams - 1 downto 0) := (others => '1');  -- default all streams are enabled
   signal stream_lost_arr       : std_logic_vector(g_nof_streams - 1 downto 0) := (others => '0');  -- default no streams are lost
@@ -501,6 +509,7 @@ begin
   generic map (
     g_nof_streams                => g_nof_streams,
     g_bsn_latency_max            => g_bsn_latency_max,
+    g_bsn_latency_first_node     => g_bsn_latency_first_node,
     g_nof_aligners_max           => c_nof_aligners_max,
     g_block_size                 => g_block_size,
     g_bsn_w                      => g_bsn_w,
@@ -515,8 +524,8 @@ begin
     dp_rst         => rst,
     dp_clk         => clk,
     -- Control
-    node_index     => node_index_arr(0),
-    stream_en_arr  => stream_en_arr,
+    chain_node_index => chain_node_index_arr(0),
+    stream_en_arr    => stream_en_arr,
     -- Streaming input
     in_sosi_arr    => dut_in_sosi_2arr(0),
     -- Output via local MM interface in dp_clk domain
@@ -553,6 +562,7 @@ begin
     generic map (
       g_nof_streams                => g_nof_streams,
       g_bsn_latency_max            => g_bsn_latency_max,
+      g_bsn_latency_first_node     => g_bsn_latency_first_node,
       g_nof_aligners_max           => c_nof_aligners_max,
       g_block_size                 => g_block_size,
       g_bsn_w                      => g_bsn_w,
@@ -567,8 +577,8 @@ begin
       dp_rst         => rst,
       dp_clk         => clk,
       -- Control
-      node_index     => node_index_arr(I),
-      stream_en_arr  => stream_en_arr,
+      chain_node_index => chain_node_index_arr(I),
+      stream_en_arr    => stream_en_arr,
       -- Streaming input
       in_sosi_arr    => dut_in_sosi_2arr(I),
       -- Output via streaming DP interface
diff --git a/libraries/base/dp/tb/vhdl/tb_dp_repack_data.vhd b/libraries/base/dp/tb/vhdl/tb_dp_repack_data.vhd
index 1fbc042d83c68b5f5e55817d62faab380e2ca0c7..4a09164eaf1339f29b58c733de09d3c67e1da91b 100644
--- a/libraries/base/dp/tb/vhdl/tb_dp_repack_data.vhd
+++ b/libraries/base/dp/tb/vhdl/tb_dp_repack_data.vhd
@@ -54,17 +54,17 @@ entity tb_dp_repack_data is
     g_flow_control_stimuli   : t_dp_flow_control_enum := e_active;  -- always e_active, e_random or e_pulse flow control
     g_flow_control_verify    : t_dp_flow_control_enum := e_active;  -- always e_active, e_random or e_pulse flow control
     -- specific
-    g_in_dat_w               : natural := 8 * 42;
-    g_in_nof_words           : natural := 1;
-    g_pack_dat_w             : natural := 32;
-    g_pack_nof_words         : natural := 11;
-    g_in_bypass              : boolean := true;  -- can use TRUE when g_in_nof_words=1  or g_in_nof_words=g_out_nof_words
+    g_in_dat_w               : natural := 36;
+    g_in_nof_words           : natural := 16;
+    g_pack_dat_w             : natural := 64;
+    g_pack_nof_words         : natural := 9;
+    g_in_bypass              : boolean := false;  -- can use TRUE when g_in_nof_words=1  or g_in_nof_words=g_out_nof_words
     g_pack_bypass            : boolean := false;  -- can use TRUE when g_out_nof_words=1 or g_in_nof_words=g_out_nof_words
-    g_in_symbol_w            : natural := 8;  -- default 1 for snk_in.empty  in nof bits, else use power of 2
-    g_pack_symbol_w          : natural := 8;  -- default 1 for src_out.empty in nof bits, else use power of 2
+    g_in_symbol_w            : natural := 1;  -- default 1 for snk_in.empty  in nof bits, else use power of 2
+    g_pack_symbol_w          : natural := 1;  -- default 1 for src_out.empty in nof bits, else use power of 2
     g_nof_repeat             : natural := 10;
-    g_pkt_len                : natural := 1;  -- if not a multiple of g_in_nof_words then the input stage flush creates gap between blocks
-    g_pkt_gap                : natural := 0
+    g_pkt_len                : natural := 64;  -- if not a multiple of g_in_nof_words then the input stage flush creates gap between blocks
+    g_pkt_gap                : natural := 10
   );
 end tb_dp_repack_data;
 
diff --git a/libraries/base/dp/tb/vhdl/tb_mmp_dp_bsn_align_v2.vhd b/libraries/base/dp/tb/vhdl/tb_mmp_dp_bsn_align_v2.vhd
index 78ff1107922be99fcc03ddd829a454cf448692a1..79f281d6e3b7c08fa3f913d698379b7d3620e966 100644
--- a/libraries/base/dp/tb/vhdl/tb_mmp_dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/tb/vhdl/tb_mmp_dp_bsn_align_v2.vhd
@@ -63,6 +63,7 @@ architecture tb of tb_mmp_dp_bsn_align_v2 is
   -- . for dp_bsn_align_v2
   constant c_nof_streams                : natural := 5;
   constant c_bsn_latency_max            : natural := 1;
+  constant c_bsn_latency_first_node     : natural := c_bsn_latency_max;
   constant c_nof_aligners_max           : positive := 1;  -- fixed in this tb
   constant c_block_size                 : natural := 11;
   constant c_block_period               : natural := 11;
@@ -103,8 +104,10 @@ architecture tb of tb_mmp_dp_bsn_align_v2 is
   constant c_gap_size                   : natural := c_block_period - c_block_size;
 
   -- DUT latency
+  constant c_ref_sosi_latency           : natural := 0;
   constant c_mm_to_dp_latency           : natural := 1;
-  constant c_dut_latency                : natural := c_pipeline_input + c_rd_latency + c_mm_to_dp_latency + c_pipeline_output;
+  constant c_dut_latency                : natural := c_pipeline_input + c_ref_sosi_latency +
+                                                     c_rd_latency + c_mm_to_dp_latency + c_pipeline_output;
 
   constant c_align_latency_nof_blocks   : natural := c_bsn_latency_max * c_nof_aligners_max;  -- in number blocks
   constant c_align_latency_nof_valid    : natural := c_bsn_latency_max * c_nof_aligners_max * c_block_size;  -- in number of data samples
@@ -147,7 +150,7 @@ architecture tb of tb_mmp_dp_bsn_align_v2 is
   signal dp_clk                   : std_logic := '1';
   signal dp_rst                   : std_logic := '1';
 
-  signal node_index               : natural := 0;
+  signal chain_node_index         : natural := 0;
   signal ref_siso_arr             : t_dp_siso_arr(c_nof_streams - 1 downto 0) := (others => c_dp_siso_rdy);
   signal ref_sosi_arr             : t_dp_sosi_arr(c_nof_streams - 1 downto 0);  -- generated stimuli
   signal in_sosi_arr              : t_dp_sosi_arr(c_nof_streams - 1 downto 0) := (others => c_dp_sosi_rst);  -- input stimuli
@@ -427,6 +430,7 @@ begin
   generic map (
     g_nof_streams                => c_nof_streams,
     g_bsn_latency_max            => c_bsn_latency_max,
+    g_bsn_latency_first_node     => c_bsn_latency_first_node,
     g_nof_aligners_max           => c_nof_aligners_max,
     g_block_size                 => c_block_size,
     g_bsn_w                      => c_bsn_w,
@@ -456,7 +460,7 @@ begin
     dp_rst                  => dp_rst,
     dp_clk                  => dp_clk,
 
-    node_index              => node_index,
+    chain_node_index        => chain_node_index,
     -- Streaming input
     in_sosi_arr             => in_sosi_arr,
     -- Output via local MM in dp_clk domain
diff --git a/libraries/base/dp/tb/vhdl/tb_tb_dp_bsn_align_v2.vhd b/libraries/base/dp/tb/vhdl/tb_tb_dp_bsn_align_v2.vhd
index d1ac6698b74d433a03b75d037ee651dafc42690b..8c9a3bdc2ba244940a8a6b7c6018170d3e611b81 100644
--- a/libraries/base/dp/tb/vhdl/tb_tb_dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/tb/vhdl/tb_tb_dp_bsn_align_v2.vhd
@@ -23,8 +23,9 @@
 -- > as 3
 -- > run -all
 
-library IEEE;
+library IEEE, common_lib;
 use IEEE.std_logic_1164.all;
+use common_lib.common_pkg.all;
 use work.tb_dp_pkg.all;
 
 entity tb_tb_dp_bsn_align_v2 is
@@ -33,13 +34,14 @@ end tb_tb_dp_bsn_align_v2;
 architecture tb of tb_tb_dp_bsn_align_v2 is
   constant c_block                : natural := 11;
   constant c_period               : natural := 20;
-  constant c_nof_blk              : natural := 30;
+  constant c_nof_blk              : natural := 20;  -- choose > circular buffer size
 
   signal tb_end : std_logic := '0';  -- declare tb_end to avoid 'No objects found' error on 'when -label tb_end'
 begin
   -- -- DUT
   -- g_nof_streams                : NATURAL := 2;      -- number of input and output streams
   -- g_bsn_latency_max            : NATURAL := 1;      -- Maximum travel latency of a remote block in number of block periods T_blk
+  -- g_bsn_latency_first_node     : natural := 1;
   -- g_nof_aligners_max           : NATURAL := 1;      -- 1 when only align at last node, > 1 when align at every intermediate node
   -- g_block_size                 : NATURAL := 11;     -- > 1, g_block_size=1 is not supported
   -- g_block_period               : NATURAL := 20;     -- >= g_block_size, = g_block_size + c_gap_size
@@ -61,25 +63,34 @@ begin
   -- g_tb_nof_restart       : NATURAL := 1;       -- number of times to restart the input stimuli
   -- g_tb_nof_blocks        : NATURAL := 10       -- number of input blocks per restart
 
-  u_mm_output               : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    true, 0, 0, 1,  0, 2, c_nof_blk);
-  u_mm_output_pow2          : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1,      16, c_period, 32, 16, 17, 0, 0,  0, 3,    true, 0, 0, 1,  0, 2, c_nof_blk);  -- g_block_size = 2**4 = 16
-  u_mm_output_large_bsn     : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3000, true, 0, 0, 1,  0, 2, c_nof_blk);  -- test where bsn * g_block_size > 2^10 to test address resizing
-  u_mm_output_single        : entity work.tb_dp_bsn_align_v2 generic map (1, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    true, 0, 0, 1,  0, 2, c_nof_blk);
-  u_dp_output               : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_dp_output_pow2          : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1,      16, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);  -- g_block_size = 2**4 = 16
-  u_dp_output_large_bsn     : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3000, false, 0, 0, 1,  0, 2, c_nof_blk);  -- test where bsn * g_block_size > 2^10 to test address resizing
-  u_dp_output_single        : entity work.tb_dp_bsn_align_v2 generic map (1, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_dp_output_p1            : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 1, 1, 1,  0, 2, c_nof_blk);
-  u_bsn_lat_max_2           : entity work.tb_dp_bsn_align_v2 generic map (2, 2, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_bsn_lat_max_3           : entity work.tb_dp_bsn_align_v2 generic map (2, 3, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_p1_rd2                  : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 1, 0, 2,  0, 2, c_nof_blk);
-  u_zero_gap                : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1, c_block,  c_block, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_zero_gap_p1_rd2         : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 1, c_block,  c_block, 32, 16, 17, 0, 0,  0, 3,    false, 1, 1, 2,  0, 2, c_nof_blk);
-  u_stream_disable          : entity work.tb_dp_bsn_align_v2 generic map (3, 1, 1, c_block, c_period, 32, 16, 17, 2, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_stream_lost             : entity work.tb_dp_bsn_align_v2 generic map (3, 1, 1, c_block, c_period, 32, 16, 17, 0, 2,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_stream_disable_lost     : entity work.tb_dp_bsn_align_v2 generic map (4, 1, 1, c_block, c_period, 32, 16, 17, 1, 2,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_bsn_lost                : entity work.tb_dp_bsn_align_v2 generic map (3, 1, 1, c_block, c_period, 32, 16, 17, 0, 0, 10, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_diff_delay              : entity work.tb_dp_bsn_align_v2 generic map (3, 1, 1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1, -1, 2, c_nof_blk);
-  u_nof_aligners            : entity work.tb_dp_bsn_align_v2 generic map (2, 1, 8, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
-  u_nof_aligners_diff_delay : entity work.tb_dp_bsn_align_v2 generic map (4, 1, 3, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1, -1, 2, c_nof_blk);
+  u_mm_output               : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    true,  0, 0, 1,  0, 2, c_nof_blk);
+  u_mm_output_single        : entity work.tb_dp_bsn_align_v2 generic map (1, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    true,  0, 0, 1,  0, 2, c_nof_blk);
+  u_output                  : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+  u_output_single           : entity work.tb_dp_bsn_align_v2 generic map (1, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+  u_output_pipe1            : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 1, 1, 1,  0, 2, c_nof_blk);
+  u_pipe1_rdlat2            : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 1, 0, 2,  0, 2, c_nof_blk);
+  u_zero_gap                : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1, c_block,  c_block, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+  u_zero_gap_pipe1_rdlat2   : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1, c_block,  c_block, 32, 16, 17, 0, 0,  0, 3,    false, 1, 1, 2,  0, 2, c_nof_blk);
+  u_stream_disable          : entity work.tb_dp_bsn_align_v2 generic map (3, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 2, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+  u_stream_lost             : entity work.tb_dp_bsn_align_v2 generic map (3, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 2,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+  u_stream_disable_lost     : entity work.tb_dp_bsn_align_v2 generic map (4, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 1, 2,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+  u_bsn_lost                : entity work.tb_dp_bsn_align_v2 generic map (3, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0, 10, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+  u_diff_delay              : entity work.tb_dp_bsn_align_v2 generic map (3, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1, -1, 2, c_nof_blk);
+
+  -- g_block_size = 2**4 = 16
+  u_mm_block_pow2           : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1,      16, c_period, 32, 16, 17, 0, 0,  0, 3,    true,  0, 0, 1,  0, 2, c_nof_blk);
+  u_block_pow2              : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1,      16, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+
+  -- test where bsn * g_block_size > 2^10 to test address resizing
+  u_mm_large_bsn            : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3000, true,  0, 0, 1,  0, 2, c_nof_blk);
+  u_large_bsn               : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3000, false, 0, 0, 1,  0, 2, c_nof_blk);
+
+  -- BSN latency
+  u_bsn_lat_max_2           : entity work.tb_dp_bsn_align_v2 generic map (2,   2,   2,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+  u_bsn_lat_max_3           : entity work.tb_dp_bsn_align_v2 generic map (2,   3,   3,  1, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2, c_nof_blk);
+
+  -- chain of aligners
+  u_nof_aligners_16         : entity work.tb_dp_bsn_align_v2 generic map (2,   2, c_1, 16, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2,       100);
+  u_nof_aligners_8          : entity work.tb_dp_bsn_align_v2 generic map (2, c_1, c_1,  8, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1,  0, 2,        50);
+  u_nof_aligners_diff_delay : entity work.tb_dp_bsn_align_v2 generic map (4, c_1, c_1,  3, c_block, c_period, 32, 16, 17, 0, 0,  0, 3,    false, 0, 0, 1, -1, 2,        50);
 end tb;