diff --git a/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd b/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
index 1cb352790fe78b5c37bf5d1370d272f94e77a6ad..a82e866ed5c39e03ee904c47301ce44b9ec919e2 100644
--- a/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
@@ -125,6 +125,7 @@ ARCHITECTURE rtl OF dp_bsn_align_v2 IS
     rd_pointer        : INTEGER;  -- use integer to detect need to wrap to natural
     rd_offset         : STD_LOGIC_VECTOR(c_mem_ram.adr_w-1 DOWNTO 0);
     rd_copi           : t_mem_copi;
+    out_sosi_arr      : t_dp_sosi_arr(g_nof_streams-1 DOWNTO 0);  -- used to hold bsn, otherwise combinatorial
   END RECORD;
 
   CONSTANT c_reg_rst  : t_reg := (0,
@@ -136,19 +137,26 @@ ARCHITECTURE rtl OF dp_bsn_align_v2 IS
                                   c_dp_sosi_rst,
                                   0,
                                   (OTHERS=>'0'),
-                                  c_mem_copi_rst);
+                                  c_mem_copi_rst,
+                                  (OTHERS=>c_dp_sosi_rst));
 
   -- State registers
   SIGNAL r             : t_reg;
   SIGNAL nxt_r         : t_reg;
 
+  -- Wires
+  SIGNAL dp_done       : STD_LOGIC;  -- dp_done_arr(0), for viewing only
+  SIGNAL dp_done_arr   : STD_LOGIC_VECTOR(g_nof_streams-1 DOWNTO 0);  -- mm_done of u_mm_to_dp per stream
   SIGNAL dp_copi       : t_mem_copi;
-  SIGNAL dp_cipo_arr   : t_mem_cipo_arr(g_nof_streams-1 DOWNTO 0);
+  SIGNAL dp_copi_arr   : t_mem_copi_arr(g_nof_streams-1 DOWNTO 0);  -- mm_mosi of u_mm_to_dp per stream, stream 0 drives dp_copi
+
+  SIGNAL dp_sosi       : t_dp_sosi;  -- r.mm_sosi captured in p_comb when r.mm_sosi.sop = '1', source of sync and bsn for out_sosi_arr
+  SIGNAL rd_sosi_arr   : t_dp_sosi_arr(g_nof_streams-1 DOWNTO 0);  -- out_sosi of u_mm_to_dp per stream
+  SIGNAL rd_cipo_arr   : t_mem_cipo_arr(g_nof_streams-1 DOWNTO 0) := (OTHERS=>c_mem_cipo_rst);  -- read data, input of p_comb and mm_miso of u_mm_to_dp
 
   -- Pipeline registers
   SIGNAL in_sosi_arr_p : t_dp_sosi_arr(g_nof_streams-1 DOWNTO 0);
   SIGNAL rd_copi_p     : t_mem_copi;
-  SIGNAL rd_cipo_arr   : t_mem_cipo_arr(g_nof_streams-1 DOWNTO 0) := (OTHERS=>c_mem_cipo_rst);
 
 BEGIN
 
@@ -163,10 +171,10 @@ BEGIN
     END IF;
   END PROCESS;
   
-  p_comb : PROCESS(r, in_sosi_arr_p, mm_copi, rd_cipo_arr)
+  p_comb : PROCESS(r, in_sosi_arr_p, mm_copi, dp_copi, dp_sosi, rd_cipo_arr, rd_sosi_arr)  -- dp_copi and dp_sosi are read in p_read, so they must retrigger the process too
     -- State variable
     VARIABLE v : t_reg;
-    -- Auxiliary variables
+    -- Auxiliary variables / local wires / no memory
     VARIABLE v_ref_sosi          : t_dp_sosi;
     VARIABLE v_pointer_slv       : STD_LOGIC_VECTOR(c_blk_pointer_w-1 DOWNTO 0);
     VARIABLE v_product_slv       : STD_LOGIC_VECTOR(c_product_w-1 DOWNTO 0);
@@ -250,32 +258,54 @@ BEGIN
     END IF;
 
     -- p_read
-    -- . adjust the rd address
-    IF g_use_mm_output THEN
-      v.rd_copi := mm_copi;  -- do output via MM interface
-      v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, mm_copi.address));  -- sum yields c_mem_ram.adr_w bits, because left operand determines width
-    ELSE
-      v.rd_copi := dp_copi;  -- do output via DP streaming interface
-      v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, dp_copi.address));  -- sum yields c_mem_ram.adr_w bits, because left operand determines width
-    END IF;
-
-    -- . output the rd data
+    -- . prepare the rd data
     v_rd_cipo_arr := rd_cipo_arr;  -- default use input data
     FOR I IN 0 TO g_nof_streams-1 LOOP
       IF r.use_filler_data(I) = '1' THEN
         v_rd_cipo_arr(I).rddata := TO_MEM_SDATA(g_filler_value);  -- replace by filler data
       END IF;
     END LOOP;
+
     IF g_use_mm_output THEN
-      mm_cipo_arr <= v_rd_cipo_arr;  -- output via MM interface
+      -- . adjust the rd address
+      v.rd_copi := mm_copi;  -- do output via MM interface
+      v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, mm_copi.address));  -- sum yields c_mem_ram.adr_w bits, because left operand determines width
+
+      -- . output the (filler substituted) rd data via MM interface
+      mm_cipo_arr <= v_rd_cipo_arr;
     ELSE
-      dp_cipo_arr <= v_rd_cipo_arr;  -- output via DP streaming interface
+      -- . adjust the rd address
+      v.rd_copi := dp_copi;  -- do output via DP streaming interface
+      v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, dp_copi.address));  -- sum yields c_mem_ram.adr_w bits, because left operand determines width
+
+      -- . hold mm_sosi.sync, bsn. NOTE(review): dp_sosi is a signal that is only assigned under this condition, unless it has a default elsewhere in p_comb this infers a latch - consider holding sync, bsn in a t_reg field instead
+      IF r.mm_sosi.sop = '1' THEN
+        dp_sosi <= r.mm_sosi;
+      END IF;
+
+      -- . apply the held mm_sosi.sync, bsn at sop of stream 0 to all streams in out_sosi_arr
+      v.out_sosi_arr := rd_sosi_arr;
+      IF rd_sosi_arr(0).sop = '1' THEN
+        v.out_sosi_arr := func_dp_stream_arr_set(v.out_sosi_arr, dp_sosi.sync, "SYNC");
+        v.out_sosi_arr := func_dp_stream_arr_set(v.out_sosi_arr, dp_sosi.bsn, "BSN");
+      ELSE
+        -- hold sosi.bsn until next sop, to ease viewing in wave window
+        FOR I IN 0 TO g_nof_streams-1 LOOP
+          v.out_sosi_arr(I).bsn := r.out_sosi_arr(I).bsn;
+        END LOOP;
+      END IF;
+
+      -- . output via DP streaming interface, combinatorially from v (r.out_sosi_arr is only read to hold bsn)
+      out_sosi_arr <= v.out_sosi_arr;  -- NOTE(review): in this section out_sosi_arr is only assigned here and mm_cipo_arr only in the MM branch - confirm the unused output may stay unassigned
     END IF;
 
     -- next state
     nxt_r <= v;
   END PROCESS;
 
+  ------------------------------------------------------------------------------
+  -- Circular buffers
+  ------------------------------------------------------------------------------
 
   gen_streams : FOR I IN 0 TO g_nof_streams-1 GENERATE
     u_data_buffer : ENTITY common_lib.common_ram_r_w
@@ -295,8 +325,47 @@ BEGIN
     );
   END GENERATE;
 
+  ------------------------------------------------------------------------------
+  -- MM to streaming DP
+  ------------------------------------------------------------------------------
+  gen_use_mm_output : IF g_use_mm_output GENERATE
+    -- empty, with MM output p_comb assigns the read data to mm_cipo_arr directly
+  END GENERATE;
+
+  gen_streaming_output : IF NOT g_use_mm_output GENERATE  -- one dp_block_from_mm per stream, takes MM read data rd_cipo_arr and outputs DP stream rd_sosi_arr
+    dp_copi <= dp_copi_arr(0);   -- read request of stream 0, presumably equal for all streams because they share start_pulse and start_address
+    dp_done <= dp_done_arr(0);   -- for viewing only
+
+    gen_mm_to_dp : FOR I IN 0 TO g_nof_streams-1 GENERATE
+      u_mm_to_dp: ENTITY work.dp_block_from_mm
+      GENERIC MAP (
+        g_data_size          => 1,
+        g_step_size          => 1,
+        g_nof_data           => g_block_size,
+        g_data_w             => g_data_w,
+        g_mm_rd_latency      => g_rd_latency,
+        g_reverse_word_order => FALSE
+      )
+      PORT MAP (
+        rst           => dp_rst,
+        clk           => dp_clk,
+        start_pulse   => r.mm_sosi.sop,   -- same start pulse for all streams
+        start_address => 0,
+        mm_done       => dp_done_arr(I),
+        mm_mosi       => dp_copi_arr(I),
+        mm_miso       => rd_cipo_arr(I),
+        out_sosi      => rd_sosi_arr(I),  -- sync and bsn are filled in by p_comb
+        out_siso      => c_dp_siso_rdy    -- constant siso, presumably always ready so no flow control - TODO confirm
+      );
+    END GENERATE;
 
+  END GENERATE;
+
+
+  ------------------------------------------------------------------------------
   -- Pipelining
+  ------------------------------------------------------------------------------
+
   -- . input
   u_in_sosi_arr_p : ENTITY work.dp_pipeline_arr
   GENERIC MAP (
diff --git a/libraries/base/dp/tb/vhdl/tb_dp_bsn_align_v2.vhd b/libraries/base/dp/tb/vhdl/tb_dp_bsn_align_v2.vhd
index c8721bd6fb3a58dba987952d5bfab8680666cb0f..d490525aa1bc999cc9f969062226c29ad5a3b941 100644
--- a/libraries/base/dp/tb/vhdl/tb_dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/tb/vhdl/tb_dp_bsn_align_v2.vhd
@@ -44,6 +44,7 @@ ENTITY tb_dp_bsn_align_v2 IS
     g_bsn_w                      : NATURAL := c_dp_stream_bsn_w;  -- number of bits in sosi BSN
     g_data_w                     : NATURAL := 16;     -- number of bits in sosi data
     g_filler_value               : INTEGER := 0;      -- output sosi data value for missing input blocks
+    g_use_mm_output              : BOOLEAN := FALSE;   -- output via MM or via streaming DP
     g_pipeline_input             : NATURAL := 1;      -- >= 0, choose 0 for wires, choose 1 to ease timing closure
     g_rd_latency                 : NATURAL := 2;      -- 1 or 2, choose 2 to ease timing closure
 
@@ -86,7 +87,14 @@ ARCHITECTURE tb OF tb_dp_bsn_align_v2 IS
   TYPE t_rl_vec_arr  IS ARRAY (g_nof_streams-1 DOWNTO 0) OF STD_LOGIC_VECTOR(0 TO c_rl);
 
   TYPE t_tb_state IS (s_idle, s_bsn_mis_aligned, s_bsn_aligned, s_small_bsn_diff, s_large_bsn_diff, s_restore_bsn, s_disable_one_input, s_enable_inputs);
-  
+
+  TYPE t_reg IS RECORD
+    -- state of p_comb / p_reg in gen_use_mm_output
+    sync         : STD_LOGIC;                                  -- mm_sosi.sync captured at mm_sosi.sop
+    bsn          : STD_LOGIC_VECTOR(g_bsn_w-1 DOWNTO 0);       -- mm_sosi.bsn captured at mm_sosi.sop
+    out_sosi_arr : t_dp_sosi_arr(g_nof_streams-1 DOWNTO 0);    -- previous output, used to hold bsn between sops
+  END RECORD;
+
   SIGNAL tb_end            : STD_LOGIC := '0';
   SIGNAL clk               : STD_LOGIC := '1';
   SIGNAL rst               : STD_LOGIC := '1';
@@ -115,6 +123,10 @@ ARCHITECTURE tb OF tb_dp_bsn_align_v2 IS
   SIGNAL mm_sosi           : t_dp_sosi;   -- streaming information that signals that an output block can be read
   SIGNAL mm_done_arr       : STD_LOGIC_VECTOR(g_nof_streams-1 DOWNTO 0);
   SIGNAL mm_done           : STD_LOGIC;
+  SIGNAL dut_sosi_arr      : t_dp_sosi_arr(g_nof_streams-1 DOWNTO 0);  -- out_sosi_arr port of the DUT
+  SIGNAL tb_sosi_arr       : t_dp_sosi_arr(g_nof_streams-1 DOWNTO 0);  -- out_sosi of u_mm_to_dp in the tb, before sync and bsn are applied
+  SIGNAL r                 : t_reg;  -- registered state of p_comb in gen_use_mm_output
+  SIGNAL nxt_r             : t_reg;  -- next state of r
 
   SIGNAL out_siso_arr      : t_dp_siso_arr(g_nof_streams-1 DOWNTO 0) := (OTHERS=>c_dp_siso_rdy);
   SIGNAL out_sosi_arr      : t_dp_sosi_arr(g_nof_streams-1 DOWNTO 0);
@@ -413,7 +425,7 @@ BEGIN
     g_bsn_w                      => g_bsn_w,
     g_data_w                     => g_data_w,
     g_filler_value               => g_filler_value,
-    g_use_mm_output              => TRUE,               -- output via MM or via streaming DP
+    g_use_mm_output              => g_use_mm_output,    -- output via MM or via streaming DP
     g_pipeline_input             => g_pipeline_input,   -- >= 0, choose 0 for wires, choose 1 to ease timing closure
     g_rd_latency                 => g_rd_latency        -- 1 or 2, choose 2 to ease timing closure
   )
@@ -425,41 +437,84 @@ BEGIN
     stream_en_arr  => stream_en_arr,
     -- Streaming input
     in_sosi_arr    => in_sosi_arr,
-    -- Output via local MM in dp_clk domain
+    -- Output via local MM interface in dp_clk domain
     mm_copi        => mm_copi,
     mm_cipo_arr    => mm_cipo_arr,
-    mm_sosi        => mm_sosi
-  );
+    mm_sosi        => mm_sosi,
 
+    -- Output via streaming DP interface
+    out_sosi_arr   => dut_sosi_arr
+  );
 
   ------------------------------------------------------------------------------
   -- MM to streaming DP
   ------------------------------------------------------------------------------
+  no_use_mm_output : IF NOT g_use_mm_output GENERATE
+    out_sosi_arr <= dut_sosi_arr;  -- take the streaming DP output of the DUT as is
+  END GENERATE;
+
+  gen_use_mm_output : IF g_use_mm_output GENERATE  -- DUT outputs via MM, so read the blocks and form out_sosi_arr here in the tb
+    mm_copi <= mm_copi_arr(0);   -- read request of stream 0 is applied to the DUT
+    mm_done <= mm_done_arr(0);   -- for viewing only
+
+    gen_mm_to_dp : FOR I IN 0 TO g_nof_streams-1 GENERATE
+      u_mm_to_dp: ENTITY work.dp_block_from_mm
+      GENERIC MAP (
+        g_data_size          => 1,
+        g_step_size          => 1,
+        g_nof_data           => g_block_size,
+        g_data_w             => g_data_w,
+        g_mm_rd_latency      => g_rd_latency,
+        g_reverse_word_order => FALSE
+      )
+      PORT MAP (
+        rst           => rst,
+        clk           => clk,
+        start_pulse   => mm_sosi.sop,     -- same start pulse for all streams
+        start_address => 0,
+        mm_done       => mm_done_arr(I),
+        mm_mosi       => mm_copi_arr(I),
+        mm_miso       => mm_cipo_arr(I),
+        out_sosi      => tb_sosi_arr(I),  -- sync and bsn are filled in by p_comb
+        out_siso      => c_dp_siso_rdy
+      );
+    END GENERATE;
+
+    p_comb : PROCESS(r, mm_sosi, tb_sosi_arr)
+      VARIABLE v : t_reg;
+    BEGIN
+      v := r;
+
+      -- hold mm_sosi.sync, bsn
+      IF mm_sosi.sop = '1' THEN
+        v.sync := mm_sosi.sync;
+        v.bsn  := mm_sosi.bsn;
+      END IF;
+
+      -- apply the registered mm_sosi.sync, bsn at sop of stream 0 to all streams in out_sosi_arr
+      v.out_sosi_arr := tb_sosi_arr;
+      IF tb_sosi_arr(0).sop = '1' THEN
+        v.out_sosi_arr := func_dp_stream_arr_set(v.out_sosi_arr, r.sync, "SYNC");
+        v.out_sosi_arr := func_dp_stream_arr_set(v.out_sosi_arr, r.bsn, "BSN");
+      ELSE
+        -- hold sosi.bsn until next sop, to ease viewing in wave window
+        FOR I IN 0 TO g_nof_streams-1 LOOP
+          v.out_sosi_arr(I).bsn := r.out_sosi_arr(I).bsn;
+        END LOOP;
+      END IF;
+
+      -- next state
+      nxt_r <= v;
+    END PROCESS;
 
-  gen_mm_to_dp : FOR I IN 0 TO g_nof_streams-1 GENERATE
-    u_mm_to_dp: ENTITY work.dp_block_from_mm
-    GENERIC MAP (
-      g_data_size          => 1,
-      g_step_size          => 1,
-      g_nof_data           => g_block_size,
-      g_data_w             => g_data_w,
-      g_mm_rd_latency      => g_rd_latency,
-      g_reverse_word_order => FALSE
-    )
-    PORT MAP (
-      rst           => rst,
-      clk           => clk,
-      start_pulse   => mm_sosi.sop,
-      start_address => 0,
-      mm_done       => mm_done_arr(I),
-      mm_mosi       => mm_copi_arr(I),
-      mm_miso       => mm_cipo_arr(I),
-      out_sosi      => out_sosi_arr(I),
-      out_siso      => c_dp_siso_rdy
-    );
+    p_reg : PROCESS(clk)  -- no reset: r.sync and r.bsn keep their initial value until the first mm_sosi.sop has been registered
+    BEGIN
+      IF rising_edge(clk) THEN
+        r <= nxt_r;
+      END IF;
+    END PROCESS;
+
+    out_sosi_arr <= nxt_r.out_sosi_arr;  -- combinatorial output from nxt_r, not the registered r.out_sosi_arr
   END GENERATE;
 
-  mm_copi <= mm_copi_arr(0);
-  mm_done <= mm_done_arr(0);
-  
 END tb;