dp_bsn_align_v2.vhd

-- --------------------------------------------------------------------------
-- Copyright 2021
-- ASTRON (Netherlands Institute for Radio Astronomy) <http://www.astron.nl/>
-- P.O.Box 2, 7990 AA Dwingeloo, The Netherlands
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
-- --------------------------------------------------------------------------
--
-- Author: Eric Kooistra, 3 Sept 2021
-- Purpose :
--   Align frames from multiple input streams
-- Description:
--   Aligner:
--   . The aligner uses a circular buffer to capture the blocks that arrive at
--     the input streams. The blocks have a block sequence number (BSN) that
--     is used to align the inputs. The input stream 0 is treated as local
--     input stream that is ahead of the other remote input streams. After a
--     certain number of blocks on input 0, the same block on all remote
--     inputs should also have arrived. If not then they are replaced by
--     replacement data. The output streams are paced by the block rate of
--     input 0. The user has to read the block within the block period.
--   . The aligner can align g_nof_streams that all arrive within a latency
--     of g_bsn_latency_max after the local stream at index 0. The aligner
--     can also be used in a chain of aligners, whereby each aligner typically
--     has the local input and one remote input and the remote input is the
--     output of an upstream aligner. Then the latency on the last node in
--     the chain will be within g_nof_aligners_max * g_bsn_latency_max.
--
--   Circular buffer:
--   . The size of the circular buffer is c_buffer_nof_blocks and depends on
--     the maximum latency. The c_buffer_nof_blocks has to a power of two to
--     ease the control of the circular buffer. The lowest bits of the input
--     block sequence number (BSN) are used as write block index into the
--     circular buffer. The g_bsn_latency_first_node can be useful to reduce
--     the required circular buffer size just enough, such that the next power
--     of two is only a few blocks larger, instead of almost a factor two
--     larger. This then can save a significant amount of block RAM.
--     For example: The circular buffer size c_buffer_nof_blocks is 1 + the
--     sum of bsn latencies at each node. Therefor if g_nof_aligners_max = 16
--     (a power of two) and g_bsn_latency_max = 2, then the circular buffer
--     becomes true_log_pow2(1 + 16 * 2) = 64 blocks, so almost twice as large
--     as needed. If the first input stream does not have active remote input,
--     or is disabled via stream_en_arr, then choose g_bsn_latency_first_node
--     = 1, to get a buffer size of true_log_pow2(1 + 15 * 2 + 1) = 32 blocks.
--   . In case of a chain of aligners then the circular buffer size depends on
--     the latency of local input. The most remote input will only use a
--     fraction of the buffer. Therefore more block RAM can be saved by using
--     a smaller circular buffer size for signal inputs that are from more
--     remote (i.e. that have passed through more upstream aligners).
--
--   Features:
--   . The g_block_size <= block period, so supports input blocks arriving
--     with or without data valid gaps
--   . uses replacement data to replace lost input blocks and channel bit 0 as
--     lost_data flag
--   . uses replacement data to replace disabled input streams
--   . output block can be read in arbitrary order via g_use_mm_output = true
--   . output block can be streamed via g_use_mm_output = false
--
--   Parameters:
--   . g_nof_streams: number of input and output streams. Stream index 0 is
--     the local stream. Streams index > 0 is for remote streams. The
--     remote streams arrive later than the local stream, but within
--     g_bsn_latency_max or within an integer multiple of g_bsn_latency_max.
--   . g_bsn_latency_max: >= 1, maximum travel latency of a remote block in
--     number of block periods T_blk.
--   . g_bsn_latency_first_node: typically <= g_bsn_latency_max of the other
--     nodes in a chain. Use g_bsn_latency_first_node = 0 for immediate
--     output from first node in a chain of nodes. Only used when
--     g_nof_aligners_max > 1. The g_bsn_latency_first_node setting only
--     affects the latency along the chain, and therefore the required
--     size of the circular buffer. If the circular buffer is large enough
--     anyway, then the g_bsn_latency_first_node setting is don't care,
--     assuming that a little extra latency is don't care.
--   . g_nof_aligners_max: Number of dp_bsn_align_v2 aligners in a chain.
--     = 1 when only align at last node, or
--     > 1 when align at every intermediate node in a chain of nodes, and then
--         g_nof_aligners_max should equal the number of nodes for
--         chain_node_index range. The g_nof_aligners_max is the number of
--         nodes in the chain including the first node.
--
--   Inputs:
--   . chain_node_index: Node index in chain of nodes. First node has index 0.
--     In case of a ring of nodes the chain of nodes can range the whole ring,
--     or only a part of the ring. The number of nodes in the chain is given
--     by g_nof_aligners_max. Only used when g_nof_aligners_max > 1.
--   . stream_en_arr: when '1' then align corresponding input stream, else
--     replace data from corresponding inut stream by 0 and do not raise the
--     lost data flag. Whether a stream is enabled or not has no effect on the
--     aligner timing, it only sets the data to 0.
--
--   Outputs:
--   . replace_cnt_en_arr: count number of lost data blocks per input stream,
--     that got replaced by 0 value, per sync interval.
--
--   For more detailed description see:
--   https://support.astron.nl/confluence/display/L2M/L6+FWLIB+Design+Document%3A+BSN+aligner+v2
--
-- Remarks:
-- . This dp_bsn_align_v2.vhd replaces the dp_bsn_align.vhd that was used in
--   APERTIF. Main differences are that the old component uses FIFO buffers,
--   timeouts and states, and v2 does not, which makes v2 simpler and more
--   robust.
-- . The g_bsn_latency_first_node = 0 should also be feasible, but does not
--   work and is not investigated further, because g_bsn_latency_first_node =
--   1 in combination with g_bsn_latency_max = 2 is sufficient to reduce the
--   circular buffer size when g_nof_aligners_max is a power of two.
-- . Using a circular buffer with optimum size, that does not have to have a
--   power of two number of blocks, makes the circular buffer control and
--   access more complicated and is not investigated further.

library IEEE,common_lib;
use IEEE.std_logic_1164.all;
use IEEE.numeric_std.all;
use common_lib.common_pkg.all;
use common_lib.common_mem_pkg.all;
use work.dp_stream_pkg.all;

entity dp_bsn_align_v2 is
  generic (
    g_nof_streams                : natural := 2;  -- >= 2, number of input and output streams
    g_bsn_latency_max            : natural := 2;  -- >= 1
    g_bsn_latency_first_node     : natural := 2;  -- default use same as g_bsn_latency_max
    g_nof_aligners_max           : positive := 16;
    g_block_size                 : natural := 1024;  -- > 1, g_block_size=1 is not supported
    g_bsn_w                      : natural := c_dp_stream_bsn_w;  -- number of bits in sosi BSN
    g_data_w                     : natural := 36;  -- number of bits in sosi data
    g_data_replacement_value     : integer := 0;  -- output sosi data value for missing input blocks
    g_use_mm_output              : boolean := false;  -- output via MM or via streaming DP
    g_pipeline_input             : natural := 1;  -- >= 0, 0 for wires, 1 to ease timing closure of in_sosi_arr
    g_pipeline_output            : natural := 1;  -- >= 0, 0 for wires, 1 to ease timing closure of out_sosi_arr
    g_rd_latency                 : natural := 2  -- 1 or 2, choose 2 to ease timing closure
  );
  port (
    dp_rst         : in  std_logic;
    dp_clk         : in  std_logic;

    chain_node_index : in  natural range 0 to g_nof_aligners_max - 1 := 0;

    -- MM control
    stream_en_arr            : in  std_logic_vector(g_nof_streams - 1 downto 0) := (others => '1');
    stream_replaced_cnt_arr  : out t_slv_32_arr(g_nof_streams - 1 downto 0);

    -- Streaming input
    in_sosi_arr    : in  t_dp_sosi_arr(g_nof_streams - 1 downto 0);

    -- Output via local MM interface in dp_clk domain, when g_use_mm_output = true
    -- . streaming information that signals that an output block can be read
    mm_sosi        : out t_dp_sosi;
    -- . MM read access to output block, all output streams share same mm_copi
    mm_copi        : in  t_mem_copi := c_mem_copi_rst;
    mm_cipo_arr    : out t_mem_cipo_arr(g_nof_streams - 1 downto 0);

    -- Output via streaming DP interface, when g_use_mm_output = false.
    out_sosi_arr   : out t_dp_sosi_arr(g_nof_streams - 1 downto 0)
  );
end dp_bsn_align_v2;

architecture rtl of dp_bsn_align_v2 is
  -- Circular buffer per stream, size is next power of two that fits
  constant c_buffer_nof_blocks : natural := sel_a_b(g_nof_aligners_max = 1,
           true_log_pow2(1 + g_bsn_latency_max),
           true_log_pow2(1 + g_bsn_latency_max * (g_nof_aligners_max - 1) + g_bsn_latency_first_node));

  constant c_ram_size       : natural := c_buffer_nof_blocks * g_block_size;
  constant c_ram_buf        : t_c_mem := (latency  => 1,
                                          adr_w    => ceil_log2(c_ram_size),
                                          dat_w    => g_data_w,
                                          nof_dat  => c_ram_size,
                                          init_sl  => '0');

  -- Use +1 to ensure that g_block_size that is power of two also fits in c_block_size_slv
  constant c_block_size_w   : natural := ceil_log2(g_block_size + 1);
  constant c_block_size_slv : std_logic_vector(c_block_size_w - 1 downto 0) := TO_UVEC(g_block_size, c_block_size_w);
  constant c_blk_pointer_w  : natural := ceil_log2(c_buffer_nof_blocks);

  -- Use fixed slv width instead of using naturals for address calculation, to
  -- avoid that synthesis may infer a too larger multiplier
  constant c_product_w      : natural := c_blk_pointer_w + c_block_size_w;

  -- Output on lost data flag via out_sosi_arr().channel bit 0
  constant c_channel_w      : natural := 1;

  type t_bsn_arr is array (integer range <>) of std_logic_vector(g_bsn_w - 1 downto 0);
  type t_channel_arr is array (integer range <>) of std_logic_vector(c_channel_w - 1 downto 0);
  type t_adr_arr is array (integer range <>) of std_logic_vector(c_ram_buf.adr_w - 1 downto 0);
  type t_filled_arr is array (integer range <>) of std_logic_vector(c_buffer_nof_blocks - 1 downto 0);

  -- State
  type t_reg is record
    ref_sosi             : t_dp_sosi;
    -- p_write_arr
    wr_blk_pointer       : natural;
    wr_copi_arr          : t_mem_copi_arr(g_nof_streams - 1 downto 0);
    -- all streams
    filled_arr           : t_filled_arr(g_nof_streams - 1 downto 0);
    use_replacement_data : std_logic_vector(g_nof_streams - 1 downto 0);
    -- local reference
    sync_arr             : std_logic_vector(c_buffer_nof_blocks - 1 downto 0);
    bsn_arr              : t_bsn_arr(c_buffer_nof_blocks - 1 downto 0);
    mm_sosi              : t_dp_sosi;
    dp_sosi              : t_dp_sosi;
    -- p_read
    rd_blk_pointer       : integer;  -- use integer to detect need to wrap to natural
    rd_offset            : std_logic_vector(c_ram_buf.adr_w - 1 downto 0);
    rd_copi              : t_mem_copi;
    fill_cipo_arr        : t_mem_cipo_arr(g_nof_streams - 1 downto 0);  -- used combinatorial to contain rd_cipo_arr
                                                                        -- from buffer or replacement data
    out_bsn              : std_logic_vector(g_bsn_w - 1 downto 0);  -- hold BSN until next sop, for easy view in Wave
                                                                    -- window
    out_channel_arr      : t_channel_arr(g_nof_streams - 1 downto 0);  -- hold channel until next sop per stream, for
                                                                       -- easy view in Wave window
    replace_cnt_en_arr   : std_logic_vector(g_nof_streams - 1 downto 0);
  end record;

  -- Wires and auxiliary variables in p_comb
  -- . For unique representation as signal wire, the p_comb should assign each
  --   field in t_comb only once to a variable. It is allowed to reasign a
  --   t_comb variable in p_comb, but then only the last assignment value will
  --   be visible via the signal w_comb in the Wave window.
  type t_comb is record
    blk_pointer_slv     : std_logic_vector(c_blk_pointer_w - 1 downto 0);
    product_slv         : std_logic_vector(c_product_w - 1 downto 0);
    lost_data_flags_arr : std_logic_vector(g_nof_streams - 1 downto 0);
    out_sosi_arr        : t_dp_sosi_arr(g_nof_streams - 1 downto 0);
  end record;

  constant c_reg_rst  : t_reg := (c_dp_sosi_rst,
                                  0,
                                  (others => c_mem_copi_rst),
                                  (others => (others => '0')),
                                  (others => '0'),
                                  (others => '0'),
                                  (others => (others => '0')),
                                  c_dp_sosi_rst,
                                  c_dp_sosi_rst,
                                  0,
                                  (others => '0'),
                                  c_mem_copi_rst,
                                  (others => c_mem_cipo_rst),
                                  (others => '0'),
                                  (others => (others => '0')),
                                  (others => '0'));

  constant c_comb_rst  : t_comb := ((others => '0'),
                                    (others => '0'),
                                    (others => '0'),
                                    (others => c_dp_sosi_rst));

  -- State registers for p_comb
  signal r                 : t_reg;
  signal nxt_r             : t_reg;

  -- Memoryless signals in p_comb (wires used as local variables)
  signal w_comb            : t_comb;

  -- Structural signals (wires used to connect components and IO)
  signal dp_done           : std_logic;
  signal dp_done_arr       : std_logic_vector(g_nof_streams - 1 downto 0);
  signal dp_copi           : t_mem_copi;
  signal dp_copi_arr       : t_mem_copi_arr(g_nof_streams - 1 downto 0);

  signal rd_sosi_arr       : t_dp_sosi_arr(g_nof_streams - 1 downto 0);
  signal rd_cipo_arr       : t_mem_cipo_arr(g_nof_streams - 1 downto 0) := (others => c_mem_cipo_rst);

  -- Pipeline registers
  signal in_sosi_arr_p     : t_dp_sosi_arr(g_nof_streams - 1 downto 0);
  signal rd_copi           : t_mem_copi;
  signal comb_out_sosi_arr : t_dp_sosi_arr(g_nof_streams - 1 downto 0);
  -- Counter signals
  signal replace_cnt_arr          : t_slv_32_arr(g_nof_streams - 1 downto 0);
  signal nxt_hold_replace_cnt_arr : t_slv_32_arr(g_nof_streams - 1 downto 0);
  signal hold_replace_cnt_arr     : t_slv_32_arr(g_nof_streams - 1 downto 0);

  -- Debug signals
  signal dbg_nof_streams            : natural := g_nof_streams;
  signal dbg_bsn_latency_max        : natural := g_bsn_latency_max;
  signal dbg_nof_aligners_max       : natural := g_nof_aligners_max;
  signal dbg_block_size             : natural := g_block_size;
  signal dbg_bsn_w                  : natural := g_bsn_w;
  signal dbg_data_w                 : natural := g_data_w;
  signal dbg_data_replacement_value : integer := g_data_replacement_value;
  signal dbg_use_mm_output          : boolean := g_use_mm_output;
  signal dbg_pipeline_input         : natural := g_pipeline_input;
  signal dbg_rd_latency             : natural := g_rd_latency;
  signal dbg_c_buffer_nof_blocks    : natural := c_buffer_nof_blocks;
  signal dbg_c_product_w            : natural := c_product_w;
begin
  -- Output mm_sosi, also when g_use_mm_output = FALSE.
  mm_sosi <= r.mm_sosi;

  p_reg : process(dp_clk, dp_rst)
  begin
    if dp_rst = '1' then
      r <= c_reg_rst;
    elsif rising_edge(dp_clk) then
      r <= nxt_r;
    end if;
  end process;

  p_comb : process(r, in_sosi_arr_p, mm_copi, dp_copi, rd_cipo_arr, rd_sosi_arr, stream_en_arr, chain_node_index)
    variable v : t_reg;  -- State variable
    variable w : t_comb;  -- Local wires = memoryless auxiliary variables
  begin
    w := c_comb_rst;
    v := r;  -- state signals
    v.mm_sosi := func_dp_stream_reset_control(r.mm_sosi);
    v.wr_copi_arr := RESET_MEM_COPI_CTRL(r.wr_copi_arr);
    v.replace_cnt_en_arr := (others => '0');

    ----------------------------------------------------------------------------
    -- p_write_arr
    ----------------------------------------------------------------------------
    for I in 0 to g_nof_streams - 1 loop
      -- p_write
      if in_sosi_arr_p(I).valid = '1' then
        -- . increment address during block
        v.wr_copi_arr(I).address := RESIZE_MEM_ADDRESS(INCR_UVEC(r.wr_copi_arr(I).address(c_ram_buf.adr_w - 1 downto 0), 1));
        v.wr_copi_arr(I).wr := '1';
        v.wr_copi_arr(I).wrdata := RESIZE_MEM_SDATA(in_sosi_arr_p(I).data);
      end if;

      if in_sosi_arr_p(I).sop = '1' then
        -- . set address at start of block
        w.blk_pointer_slv := in_sosi_arr_p(I).bsn(c_blk_pointer_w - 1 downto 0);
        w.product_slv := MULT_UVEC(w.blk_pointer_slv, c_block_size_slv);
        -- . resize to c_ram_buf.adr_w
        v.wr_copi_arr(I).address := RESIZE_MEM_ADDRESS(RESIZE_UVEC(w.product_slv, c_ram_buf.adr_w));

        -- . set filled flag at sop, so assume rest of block will follow in time
        v.filled_arr(I)(TO_UINT(w.blk_pointer_slv)) := '1';
      end if;
    end loop;

    ---------------------------------------------------------------------------
    -- p_control, all at sop of local reference input 0
    ---------------------------------------------------------------------------
    v.ref_sosi := in_sosi_arr_p(0);
    -- Use v.ref_sosi.sop instead of r.ref_sosi.sop, to support alignment of
    -- streams that have no data valid gap between blocks, so when
    -- g_block_size is equal to the block period or when shorter blocks have
    -- jitter in arrival time that could cause two blocks to arrive without a
    -- gap.
    if v.ref_sosi.sop = '1' then
      -- . write sync & bsn buffer
      v.wr_blk_pointer := TO_UINT(v.ref_sosi.bsn(c_blk_pointer_w - 1 downto 0));
      v.sync_arr(v.wr_blk_pointer) := v.ref_sosi.sync;
      v.bsn_arr(v.wr_blk_pointer) := v.ref_sosi.bsn(g_bsn_w - 1 downto 0);

      -- . update read block pointer at g_bsn_latency_max blocks behind the
      --   reference write pointer, dependent on the chain_node_index:
      --   - for g_nof_aligners_max = 1 the chain_node_index = 0 fixed
      --   - for g_nof_aligners_max > 1, chain_node_index = 0 is the first BSN
      --     aligner in a chain. Each subsequent node in the chain then has to
      --     account for g_bsn_latency_max additional block latency.
      if g_nof_aligners_max = 1 then
        v.rd_blk_pointer := v.wr_blk_pointer - g_bsn_latency_max;
      else
        v.rd_blk_pointer := v.wr_blk_pointer - g_bsn_latency_max * chain_node_index - g_bsn_latency_first_node;
      end if;
      if v.rd_blk_pointer < 0 then
        v.rd_blk_pointer := v.rd_blk_pointer + c_buffer_nof_blocks;
      end if;

      -- . update read address of read block pointer
      w.blk_pointer_slv := TO_UVEC(v.rd_blk_pointer, c_blk_pointer_w);
      w.product_slv := MULT_UVEC(w.blk_pointer_slv, c_block_size_slv);
      v.rd_offset := RESIZE_UVEC(w.product_slv, c_ram_buf.adr_w);

      -- . issue mm_sosi, if there is output ready to be read, indicated by filled reference block
      --   - can use 'if r.filled_arr(0)' instead of 'if v.filled_arr(0)',
      --     because input stream 0 arrives first, so is already filled
      --   - need to use 'not v.filled_arr(I)' for w.lost_data_flags_arr(I),
      --     because last input I = g_nof_streams - 1 may just got filled.
      if r.filled_arr(0)(v.rd_blk_pointer) = '1' then
        v.mm_sosi.sop := '1';
        v.mm_sosi.eop := '1';
        v.mm_sosi.valid := '1';
        -- . pass on timestamp information
        v.mm_sosi.sync := v.sync_arr(v.rd_blk_pointer);
        v.mm_sosi.bsn := RESIZE_DP_BSN(v.bsn_arr(v.rd_blk_pointer));
        -- . pass on lost data flags for enabled streams via channel field, and
        --   determine whether the ouput has to insert replacement data
        v.mm_sosi.channel := (others => '0');
        for I in 0 to g_nof_streams - 1 loop
          w.lost_data_flags_arr(I) := not v.filled_arr(I)(v.rd_blk_pointer);
          v.replace_cnt_en_arr(I) := w.lost_data_flags_arr(I);
          if stream_en_arr(I) = '1' then  -- use MM bit at sop
            v.use_replacement_data(I) := w.lost_data_flags_arr(I);  -- enabled stream, so replace the data if the data was lost
            v.mm_sosi.channel(I) := w.lost_data_flags_arr(I);  -- enabled stream, so flag the data if the data was lost
          else
            v.use_replacement_data(I) := '1';  -- disabled stream, so replace the data, but do not flag the data as lost
            v.replace_cnt_en_arr(I) := '1';
          end if;
        end loop;
      end if;

      -- . clear filled flags, after mm_sosi was issued, or could have been issued
      for I in 0 to g_nof_streams - 1 loop
        v.filled_arr(I)(v.rd_blk_pointer) := '0';
      end loop;
    end if;

    ----------------------------------------------------------------------------
    -- p_read
    ----------------------------------------------------------------------------

    -- Read the data from the buffer, or replace a block by replacement data
    -- . default use input data from the circular buffer
    v.fill_cipo_arr := rd_cipo_arr;
    -- . if necessary, replace a stream by replacement data
    for I in 0 to g_nof_streams - 1 loop
      if r.use_replacement_data(I) = '1' then
        v.fill_cipo_arr(I).rddata := TO_MEM_SDATA(g_data_replacement_value);
      end if;
    end loop;

    if g_use_mm_output then
      --------------------------------------------------------------------------
      -- Do the output via the MM interface
      --------------------------------------------------------------------------
      -- . adjust the rd address to the current buffer output block
      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVEC determines width
      v.rd_copi := mm_copi;
      v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, mm_copi.address));

      -- . output via MM interface
      mm_cipo_arr <= v.fill_cipo_arr;

      -- . no output via DP streaming interface
      comb_out_sosi_arr <= (others => c_dp_sosi_rst);
    else
      --------------------------------------------------------------------------
      -- Do the output via the DP streaming interface
      --------------------------------------------------------------------------
      -- . adjust the rd address
      --   sum yields c_ram_buf.adr_w bits, because left operand in ADD_UVEC determines width
      v.rd_copi := dp_copi;
      v.rd_copi.address := RESIZE_MEM_ADDRESS(ADD_UVEC(r.rd_offset, dp_copi.address));

      -- . hold mm_sosi.sync, bsn, channel
      if r.mm_sosi.sop = '1' then
        v.dp_sosi := r.mm_sosi;
      end if;

      -- . pass on input data from the buffer
      w.out_sosi_arr := rd_sosi_arr;  -- = v.fill_cipo_arr in streaming format, contains the
                                      -- input data from the buffer or replacement data
      if rd_sosi_arr(0).sop = '1' then
        -- . at sop pass on input info from r.dp_sosi to all streams in out_sosi_arr
        w.out_sosi_arr := func_dp_stream_arr_set(w.out_sosi_arr, r.dp_sosi.sync, "SYNC");
        w.out_sosi_arr := func_dp_stream_arr_set(w.out_sosi_arr, r.dp_sosi.bsn, "BSN");
        for I in 0 to g_nof_streams - 1 loop
          -- . pass on the lost flag per stream
          w.out_sosi_arr(I).channel := RESIZE_DP_CHANNEL(slv(r.dp_sosi.channel(I)));
        end loop;

        -- . hold sop info fields until next sop, to ease view in wave window
        v.out_bsn := r.dp_sosi.bsn(g_bsn_w - 1 downto 0);
        for I in 0 to g_nof_streams - 1 loop
          v.out_channel_arr(I) := w.out_sosi_arr(I).channel(c_channel_w - 1 downto 0);
        end loop;
      else
        -- . until next sop pass on BSN to all streams, to ease view in wave window
        w.out_sosi_arr := func_dp_stream_arr_set(w.out_sosi_arr, r.out_bsn, "BSN");
        for I in 0 to g_nof_streams - 1 loop
           -- . until next sop pass on channel bit 0 per stream, to ease view in wave window
           w.out_sosi_arr(I).channel := RESIZE_DP_CHANNEL(r.out_channel_arr(I));
        end loop;
      end if;

      -- . output via DP streaming interface
      comb_out_sosi_arr <= w.out_sosi_arr;

      -- . no output via MM interface
      mm_cipo_arr <= (others => c_mem_cipo_rst);
    end if;

    ----------------------------------------------------------------------------
    -- next state
    ----------------------------------------------------------------------------
    nxt_r <= v;

    -- local wires, only for view in wave window
    w_comb <= w;
  end process;

  ------------------------------------------------------------------------------
  -- Circular buffers
  ------------------------------------------------------------------------------

  gen_data_buffer : for I in 0 to g_nof_streams - 1 generate
    u_data_buffer : entity common_lib.common_ram_r_w
    generic map (
      g_ram     => c_ram_buf
    )
    port map (
      rst       => dp_rst,
      clk       => dp_clk,
      wr_en     => r.wr_copi_arr(I).wr,
      wr_adr    => r.wr_copi_arr(I).address(c_ram_buf.adr_w - 1 downto 0),
      wr_dat    => r.wr_copi_arr(I).wrdata(c_ram_buf.dat_w - 1 downto 0),
      rd_en     => rd_copi.rd,
      rd_adr    => rd_copi.address(c_ram_buf.adr_w - 1 downto 0),
      rd_dat    => rd_cipo_arr(I).rddata(c_ram_buf.dat_w - 1 downto 0),
      rd_val    => rd_cipo_arr(I).rdval
    );
  end generate;

  ------------------------------------------------------------------------------
  -- MM to streaming DP
  ------------------------------------------------------------------------------

  gen_streaming_output : if not g_use_mm_output generate
    gen_mm_to_dp : for I in 0 to g_nof_streams - 1 generate
      u_mm_to_dp: entity work.dp_block_from_mm
      generic map (
        g_user_size          => 1,
        g_data_size          => 1,
        g_step_size          => 1,
        g_nof_data           => g_block_size,
        g_word_w             => g_data_w,
        g_mm_rd_latency      => g_rd_latency,
        g_reverse_word_order => false
      )
      port map (
        rst           => dp_rst,
        clk           => dp_clk,
        start_pulse   => r.mm_sosi.sop,
        start_address => 0,
        mm_done       => dp_done_arr(I),
        mm_mosi       => dp_copi_arr(I),
        mm_miso       => nxt_r.fill_cipo_arr(I),
        out_sosi      => rd_sosi_arr(I),
        out_siso      => c_dp_siso_rdy
      );
    end generate;

    -- Use dp_copi_arr(0) to read same addresses in parallel for all streams
    dp_copi <= dp_copi_arr(0);
    dp_done <= dp_done_arr(0);  -- for viewing only
  end generate;

  ------------------------------------------------------------------------------
  -- Replaced packets Counter
  ------------------------------------------------------------------------------
  gen_cnt_replace : for I in 0 to g_nof_streams - 1 generate
    u_cnt_replace : entity common_lib.common_counter
    generic map (
      g_width => c_word_w
    )
    port map (
      rst     => dp_rst,
      clk     => dp_clk,
      cnt_clr => in_sosi_arr_p(0).sync,
      cnt_en  => r.replace_cnt_en_arr(I),
      count   => replace_cnt_arr(I)
    );
  end generate;

  nxt_hold_replace_cnt_arr <= replace_cnt_arr when in_sosi_arr_p(0).sync = '1' else hold_replace_cnt_arr;
  hold_replace_cnt_arr     <= nxt_hold_replace_cnt_arr when rising_edge(dp_clk);
  stream_replaced_cnt_arr  <= hold_replace_cnt_arr;

  ------------------------------------------------------------------------------
  -- Pipelining
  ------------------------------------------------------------------------------

  -- . input streams
  u_in_sosi_arr_p : entity work.dp_pipeline_arr
  generic map (
    g_nof_streams => g_nof_streams,
    g_pipeline    => g_pipeline_input  -- 0 for wires, > 0 for registers,
  )
  port map (
    rst          => dp_rst,
    clk          => dp_clk,
    -- ST sink
    snk_in_arr   => in_sosi_arr,
    -- ST source
    src_out_arr  => in_sosi_arr_p
  );

  -- . read RAM
  rd_copi <= nxt_r.rd_copi when g_rd_latency = 1 else r.rd_copi;

  -- . output streams
  u_out_sosi_arr_p : entity work.dp_pipeline_arr
  generic map (
    g_nof_streams => g_nof_streams,
    g_pipeline    => g_pipeline_output
  )
  port map (
    rst          => dp_rst,
    clk          => dp_clk,
    -- ST sink
    snk_in_arr   => comb_out_sosi_arr,
    -- ST source
    src_out_arr  => out_sosi_arr
  );
end rtl;