diff --git a/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd b/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
index b3ea7dc99f01d6c8054451961578a59f3327db7d..158b828fe417e7bad8f75a0bc9fbcc93c1573be4 100644
--- a/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
@@ -70,7 +70,7 @@ ENTITY dp_bsn_align_v2 IS
 
     -- Output via local MM interface in dp_clk domain
     mm_copi        : IN  t_mem_copi;  -- read access to output block, all output streams share same mm_copi
-    mm_cipo_arr    : OUT t_mem_copi_arr(g_nof_streams-1 DOWNTO 0);
+    mm_cipo_arr    : OUT t_mem_cipo_arr(g_nof_streams-1 DOWNTO 0);
     mm_sosi        : OUT t_dp_sosi   -- streaming information that signals that an output block can be read
   );
 END dp_bsn_align_v2;
@@ -78,12 +78,51 @@ END dp_bsn_align_v2;
 
 ARCHITECTURE rtl OF dp_bsn_align_v2 IS
 
+  -- Circular buffer per stream, with room for g_buffer_nof_blocks blocks of g_block_size words
+  CONSTANT c_ram_size       : NATURAL := g_buffer_nof_blocks * g_block_size;
+  CONSTANT c_ram_buf        : t_c_mem := (latency  => 1,
+                                          adr_w    => ceil_log2(c_ram_size),
+                                          dat_w    => g_data_w,
+                                          nof_dat  => c_ram_size,
+                                          init_sl  => '0');
+
+  CONSTANT c_block_size_w   : NATURAL := ceil_log2(g_block_size);
+  CONSTANT c_block_size_slv : STD_LOGIC_VECTOR(c_block_size_w-1 DOWNTO 0) := TO_UVEC(g_block_size, c_block_size_w);
+  CONSTANT c_blk_pointer_w  : NATURAL := ceil_log2(g_buffer_nof_blocks);  -- width of the block pointer taken from the bsn LSbits
+
+  -- Use fixed slv width instead of using naturals for address calculation, to
+  -- avoid that synthesis may infer a too large multiplier
+  CONSTANT c_product_w      : NATURAL := c_blk_pointer_w + c_block_size_w;  -- width of block pointer * block size
+  -- NOTE(review): t_adr_arr is sized with c_mem_ram.adr_w and not with c_ram_buf.adr_w, confirm that this is intended
+  TYPE t_bsn_arr IS ARRAY (INTEGER RANGE <>) OF STD_LOGIC_VECTOR(g_bsn_w-1 DOWNTO 0);
+  TYPE t_adr_arr IS ARRAY (INTEGER RANGE <>) OF STD_LOGIC_VECTOR(c_mem_ram.adr_w-1 DOWNTO 0);
+  TYPE t_filled_arr IS ARRAY (INTEGER RANGE <>) OF STD_LOGIC_VECTOR(g_buffer_nof_blocks-1 DOWNTO 0);
+
   TYPE t_reg IS RECORD
-    a           : STD_LOGIC;
-    b           : NATURAL;
+    -- p_write_arr
+    wr_pointer        : NATURAL;  -- only for debugging
+    wr_copi_arr       : t_mem_copi_arr(g_nof_streams-1 DOWNTO 0);  -- wr, address and wrdata per stream for the data buffers
+    -- all streams
+    filled_arr        : t_filled_arr(g_nof_streams-1 DOWNTO 0);  -- per stream one flag per block, set at sop, cleared at the read pointer
+    -- local reference
+    sync_arr          : STD_LOGIC_VECTOR(g_buffer_nof_blocks-1 DOWNTO 0);  -- sync of reference input 0 per block
+    bsn_arr           : t_bsn_arr(g_buffer_nof_blocks-1 DOWNTO 0);  -- bsn of reference input 0 per block
+    mm_sosi           : t_dp_sosi;  -- drives the mm_sosi output
+    -- p_read
+    rd_pointer        : NATURAL;  -- only for debugging
+    rd_offset         : STD_LOGIC_VECTOR(c_mem_ram.adr_w-1 DOWNTO 0);  -- start address of read block, NOTE(review): c_mem_ram.adr_w or c_ram_buf.adr_w? confirm
+    rd_copi           : t_mem_copi;  -- mm_copi with rd_offset added to the address
   END RECORD;
 
-  CONSTANT c_reg_rst  : t_reg := ('0', 0);
+  CONSTANT c_reg_rst  : t_reg := (wr_pointer  => 0,
+                                  wr_copi_arr => (OTHERS=>c_mem_copi_rst),
+                                  filled_arr  => (OTHERS=>(OTHERS=>'0')),
+                                  sync_arr    => (OTHERS=>'0'),
+                                  bsn_arr     => (OTHERS=>(OTHERS=>'0')),
+                                  mm_sosi     => c_dp_sosi_rst,
+                                  rd_pointer  => 0,
+                                  rd_offset   => (OTHERS=>'0'),
+                                  rd_copi     => c_mem_copi_rst);  -- reset value of r, named per t_reg field
 
   -- Local registers
   SIGNAL r            : t_reg;
@@ -91,6 +130,8 @@ ARCHITECTURE rtl OF dp_bsn_align_v2 IS
 
 BEGIN
 
+  mm_sosi <= r.mm_sosi;  -- output the mm_sosi field of state register r
+
   p_clk: PROCESS(dp_clk, dp_rst)
   BEGIN
     IF dp_rst='1' THEN
@@ -100,7 +141,107 @@ BEGIN
     END IF;
   END PROCESS;
   
+  -- p_comb: combinatorial next state logic for r. Writes every input stream
+  -- into its circular buffer and, at each sop of reference input 0, moves the
+  -- read offset g_bsn_latency_max blocks back and issues mm_sosi for that block.
+  p_comb : PROCESS(r, in_sosi_arr, mm_copi)
+    VARIABLE v             : t_reg;  -- state variable, starts as copy of r
+    VARIABLE v_ref_sosi    : t_dp_sosi;
+    VARIABLE v_pointer     : INTEGER;
+    VARIABLE v_pointer_slv : STD_LOGIC_VECTOR(c_blk_pointer_w-1 DOWNTO 0);
+    VARIABLE v_product_slv : STD_LOGIC_VECTOR(c_product_w-1 DOWNTO 0);
+  BEGIN
+    v := r;
+    -- mm_sosi strobes default to '0', so they are only active when issued below
+    v.mm_sosi.sop :=  '0';
+    v.mm_sosi.eop :=  '0';
+    v.mm_sosi.valid := '0';
+    v.mm_sosi.sync := '0';
+
+    -- p_write_arr
+    FOR I IN 0 TO g_nof_streams-1 LOOP
+      -- Default no write, to avoid that wr remains '1' after the last valid
+      v.wr_copi_arr(I).wr := '0';
+      IF in_sosi_arr(I).valid = '1' THEN
+        -- Increment address or start at block, use c_ram_buf.adr_w to count over
+        -- the same address range as is used for wr_adr of the data buffer
+        v.wr_copi_arr(I).address := RESIZE_MEM_ADDRESS(INCR_UVEC(r.wr_copi_arr(I).address(c_ram_buf.adr_w-1 DOWNTO 0), 1));
+        IF in_sosi_arr(I).sop = '1' THEN
+          v_pointer_slv := in_sosi_arr(I).bsn(c_blk_pointer_w-1 DOWNTO 0);
+          v_product_slv := STD_LOGIC_VECTOR(UNSIGNED(v_pointer_slv) * UNSIGNED(c_block_size_slv));
+          v.wr_copi_arr(I).address := RESIZE_MEM_ADDRESS(v_product_slv);
+          -- Set filled flag for the pointer of this sop, so assume rest will follow in time
+          v.filled_arr(I)(TO_UINT(v_pointer_slv)) := '1';
+        END IF;
+        v.wr_copi_arr(I).wr := '1';
+        v.wr_copi_arr(I).wrdata := RESIZE_MEM_SDATA(in_sosi_arr(I).data);
+      END IF;
+    END LOOP;
+
+    -- p_control, all at local reference input 0 sop
+    v_ref_sosi := in_sosi_arr(0);
+    IF v_ref_sosi.sop = '1' THEN
+      -- . write sync & bsn buffer
+      v_pointer := TO_UINT(v_ref_sosi.bsn(c_blk_pointer_w-1 DOWNTO 0));
+      v.sync_arr(v_pointer) := v_ref_sosi.sync;
+      v.bsn_arr(v_pointer) := v_ref_sosi.bsn(g_bsn_w-1 DOWNTO 0);
+      v.wr_pointer := v_pointer;
+
+      -- . update read pointer at g_bsn_latency_max blocks behind the reference write pointer
+      v_pointer := v_pointer - g_bsn_latency_max;
+      IF v_pointer < 0 THEN
+        v_pointer := v_pointer + g_buffer_nof_blocks;
+      END IF;
+      v.rd_pointer := v_pointer;
+
+      -- . update read address
+      v_pointer_slv := TO_UVEC(v_pointer, c_blk_pointer_w);
+      v_product_slv := STD_LOGIC_VECTOR(UNSIGNED(v_pointer_slv) * UNSIGNED(c_block_size_slv));
+      v.rd_offset := RESIZE_UVEC(v_product_slv, c_mem_ram.adr_w);
+
+      -- . issue mm_sosi, if there is output ready to be read, indicated by filled reference block
+      IF r.filled_arr(0)(v_pointer) = '1' THEN
+        v.mm_sosi.sop := '1';
+        v.mm_sosi.eop := '1';
+        v.mm_sosi.valid := '1';
+        -- . pass on timestamp information
+        v.mm_sosi.sync := v.sync_arr(v_pointer);
+        v.mm_sosi.bsn := v.bsn_arr(v_pointer);
+        -- . use channel field to pass on filled flags
+        FOR I IN 0 TO g_nof_streams-1 LOOP
+          v.mm_sosi.channel(I) := v.filled_arr(I)(v_pointer);
+        END LOOP;
+      END IF;
+
+      -- . clear filled flags, after mm_sosi was issued, or could have been issued
+      FOR I IN 0 TO g_nof_streams-1 LOOP
+        v.filled_arr(I)(v_pointer) := '0';
+      END LOOP;
+    END IF;
+
+    -- p_read
+    v.rd_copi := mm_copi;
+    v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, mm_copi.address));  -- sum yields c_mem_ram.adr_w bits, because left operand determines width
+    nxt_r <= v;  -- pass on the next state, without this the calculated v has no effect
+  END PROCESS;
 
 
+  -- One common_ram_r_w data buffer per input stream, all configured by c_ram_buf
+  gen_streams : FOR I IN 0 TO g_nof_streams-1 GENERATE
+    u_data_buffer : ENTITY common_lib.common_ram_r_w
+    GENERIC MAP (g_ram => c_ram_buf)
+    PORT MAP (
+      clk       => dp_clk,
+      rst       => dp_rst,
+      wr_dat    => r.wr_copi_arr(I).wrdata(c_ram_buf.dat_w-1 DOWNTO 0),  -- write side from state register r
+      wr_adr    => r.wr_copi_arr(I).address(c_ram_buf.adr_w-1 DOWNTO 0),
+      wr_en     => r.wr_copi_arr(I).wr,
+      -- read side is shared by all streams, use nxt_r to not increase the rd latency
+      rd_adr    => nxt_r.rd_copi.address(c_ram_buf.adr_w-1 DOWNTO 0),
+      rd_en     => nxt_r.rd_copi.rd,
+      rd_val    => mm_cipo_arr(I).rdval,
+      rd_dat    => mm_cipo_arr(I).rddata(c_ram_buf.dat_w-1 DOWNTO 0)
+    );
+  END GENERATE;
 
 END rtl;