diff --git a/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd b/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
index d4242feaebc959a9d0e792e2416a2f3cb1ec3c88..27447448cb25d3e196dfb262c3f0f332e86be09e 100644
--- a/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
+++ b/libraries/base/dp/src/vhdl/dp_bsn_align_v2.vhd
@@ -20,36 +20,36 @@
 -- Purpose :
 --   Align frames from multiple input streams
 -- Description:
---   The aligner uses a circular buffer to capture the blocks that arrive at
---   the input streams. The blocks have a block sequence number (BSN) that
---   is used to align the inputs. The input stream 0 is treated as local
---   input stream that is ahead of the other remote input streams. After a
---   certain number of blocks on input 0, the same block on all remote
---   inputs should also have arrived. If not then they are replaced by
---   replacement data. The output streams are paced by the block rate of
---   input 0. The user has to read the block within the block period.
+--   Aligner:
+--   . The aligner uses a circular buffer to capture the blocks that arrive at
+--     the input streams. The blocks have a block sequence number (BSN) that
+--     is used to align the inputs. The input stream 0 is treated as local
+--     input stream that is ahead of the other remote input streams. After a
+--     certain number of blocks on input 0, the same block on all remote
+--     inputs should also have arrived. If not then they are replaced by
+--     replacement data. The output streams are paced by the block rate of
+--     input 0. The user has to read the block within the block period.
+--   . The aligner can align g_nof_streams that all arrive within a latency
+--     of g_bsn_latency_max after the local stream at index 0. The aligner
+--     can also be used in a chain of aligners, whereby each aligner typically
+--     has the local input and one remote input and the remote input is the
+--     output of an upstream aligner. Then the latency on the last node in
+--     the chain will be within g_nof_aligners_max * g_bsn_latency_max.
 --
---   The aligner can align g_nof_streams that all arrive within a latency
---   of g_bsn_latency_max after the local stream at index 0. The aligner
---   can also be used in a chain of aligners, whereby each aligner typically
---   has the local input and one remote input and the remote input is the
---   output of an upstream aligner. Then the latency on the last node in
---   the chain will be within g_nof_aligners_max * g_bsn_latency_max.
---
---   The size of the circular buffer is c_buffer_nof_blocks and depends on the
---   maximum latency. The c_buffer_nof_blocks has to a power of two to ease
---   the control of the circular buffer. The lowest bits of the input block
---   sequence number (BSN) are used as write block index into the circular
---   buffer. The g_bsn_latency_first_node can be useful to reduce the
---   required circular buffer size just enough, such that the next power of two
---   is only a feq blocks larger, instead of almost a factor two larger. This
---   then may save a significant amount of block RAM.
---
---   In case of a chain of aligners then the circular buffer size depends on
---   the latency of local input. The most remote input will only use a
---   fraction of the buffer. Therefore more block RAM can be saved by using
---   a smaller circular buffer size for signal inputs that are from more
---   remote (i.e. that have passed through more upstream aligners).
+--   Circular buffer:
+--   . The size of the circular buffer is c_buffer_nof_blocks and depends on
+--     the maximum latency. The c_buffer_nof_blocks has to be a power of two
+--     to ease the control of the circular buffer. The lowest bits of the input
+--     block sequence number (BSN) are used as write block index into the
+--     circular buffer. The g_bsn_latency_first_node can be useful to reduce
+--     the required circular buffer size just enough, such that the next power
+--     of two is only a few blocks larger, instead of almost a factor two
+--     larger. This then can save a significant amount of block RAM.
+--   . In case of a chain of aligners then the circular buffer size depends on
+--     the latency of the local input. The most remote input will only use a
+--     fraction of the buffer. Therefore more block RAM can be saved by using
+--     a smaller circular buffer size for signal inputs that are more remote
+--     (i.e. that have passed through more upstream aligners).
 --
 --   Features:
 --   . The g_block_size <= block period, so supports input blocks arriving
@@ -213,7 +213,7 @@ architecture rtl of dp_bsn_align_v2 is
   -- . For unique representation as signal wire, the p_comb should assign each
   --   field in t_comb only once to a variable. It is allowed to reasign a
   --   t_comb variable in p_comb, but then only the last assignment value will
-  --   be visible via the signal dbg_wires in the Wave window.
+  --   be visible via the signal w_comb in the Wave window.
   type t_comb is record
     blk_pointer_slv     : std_logic_vector(c_blk_pointer_w - 1 downto 0);
     product_slv         : std_logic_vector(c_product_w - 1 downto 0);
@@ -248,7 +248,7 @@ architecture rtl of dp_bsn_align_v2 is
   signal nxt_r             : t_reg;
 
   -- Memoryless signals in p_comb (wires used as local variables)
-  signal dbg_wires         : t_comb;
+  signal w_comb            : t_comb;
 
   -- Structural signals (wires used to connect components and IO)
   signal dp_done           : std_logic;
@@ -333,11 +333,11 @@ begin
     -- p_control, all at sop of local reference input 0
     ---------------------------------------------------------------------------
     v.ref_sosi := in_sosi_arr_p(0);
-    -- Use r.ref_sosi.sop, that occurs one cycle after in_sosi_arr_p(I).sop,
-    -- to support immediate aligner output when g_use_aligner_at_first_node =
-    -- false. While the local block of chain_node_index = 0 is written into
-    -- the circular buffer, then it can already be read from the circular
-    -- buffer one dp_clk cycle later.
+    -- Use v.ref_sosi.sop instead of r.ref_sosi.sop, to support alignment of
+    -- streams that have no data valid gap between blocks, so when
+    -- g_block_size is equal to the block period or when shorter blocks have
+    -- jitter in arrival time that could cause two blocks to arrive without a
+    -- gap.
     if v.ref_sosi.sop = '1' then
       -- . write sync & bsn buffer
       v.wr_blk_pointer := TO_UINT(v.ref_sosi.bsn(c_blk_pointer_w - 1 downto 0));
@@ -365,6 +365,10 @@ begin
       v.rd_offset := RESIZE_UVEC(w.product_slv, c_ram_buf.adr_w);
 
       -- . issue mm_sosi, if there is output ready to be read, indicated by filled reference block
+      --   - can use 'if r.filled_arr(0)' instead of 'if v.filled_arr(0)',
+      --     because input stream 0 arrives first, so is already filled
+      --   - need to use 'not v.filled_arr(I)' for w.lost_data_flags_arr(I),
+      --     because the last input I = g_nof_streams - 1 may have just been filled.
       if r.filled_arr(0)(v.rd_blk_pointer) = '1' then
         v.mm_sosi.sop := '1';
         v.mm_sosi.eop := '1';
@@ -413,7 +417,7 @@ begin
       -- Do the output via the MM interface
       --------------------------------------------------------------------------
       -- . adjust the rd address to the current buffer output block
-      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVECdetermines width
+      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVEC determines width
       v.rd_copi := mm_copi;
       v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, mm_copi.address));
 
@@ -427,7 +431,7 @@ begin
       -- Do the output via the DP streaming interface
       --------------------------------------------------------------------------
       -- . adjust the rd address
-      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVECdetermines width
+      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVEC determines width
       v.rd_copi := dp_copi;
       v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, dp_copi.address));
 
@@ -475,7 +479,7 @@ begin
     nxt_r <= v;
 
     -- local wires, only for view in wave window
-    dbg_wires <= w;
+    w_comb <= w;
   end process;
 
   ------------------------------------------------------------------------------