diff --git a/libraries/dsp/bf/src/vhdl/bf_unit.vhd b/libraries/dsp/bf/src/vhdl/bf_unit.vhd
new file mode 100644
index 0000000000000000000000000000000000000000..507ee9897fbf66dd3865ce9a01d0f73019dccbd6
--- /dev/null
+++ b/libraries/dsp/bf/src/vhdl/bf_unit.vhd
@@ -0,0 +1,459 @@
+-------------------------------------------------------------------------------
+--
+-- Copyright (C) 2011
+-- ASTRON (Netherlands Institute for Radio Astronomy) <http://www.astron.nl/>
+-- P.O.Box 2, 7990 AA Dwingeloo, The Netherlands
+--
+-- This program is free software: you can redistribute it and/or modify
+-- it under the terms of the GNU General Public License as published by
+-- the Free Software Foundation, either version 3 of the License, or
+-- (at your option) any later version.
+--
+-- This program is distributed in the hope that it will be useful,
+-- but WITHOUT ANY WARRANTY; without even the implied warranty of
+-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+-- GNU General Public License for more details.
+--
+-- You should have received a copy of the GNU General Public License
+-- along with this program. If not, see <http://www.gnu.org/licenses/>.
+--
+-------------------------------------------------------------------------------
+
+-- Purpose: The bf_unit implements a beamformer algorithm. The algorithm takes a
+-- subband sample of all antenna inputs and then calculates all the beamlets
+-- for these subband samples by feeding the multiplier with the corresponding weight
+-- factors. A set of ss_wide instantiations is used for data distribution over
+-- the multipliers and is used to retrieve the same subband sample multiple
+-- times.
+--
+-- The bf_unit connects the memory with the weight factors and the output of the
+-- ss_wide selection unit to the complex multiplier. The output of the multiplier
+-- is fed to an adder tree for accumulation. After accumulation the data is
+-- passed on to two quantizers in parallel. The first quantizer is used to shape
+-- the data for the beamlet statistics unit. The second quantizer is used to shape
+-- the data for the "normal" output that can be passed on to for instance a correlator.
+--
+-- The weight-memories can be pre-initialized for simulation using .hex files. The
+-- naming convention for these files is:
+--
+-- weights_x_y.hex
+--
+-- where "weights" is the generic g_bf_weights_file_name
+-- "x" is the bf_unit number (not appended in this file; NOTE(review): presumably already part of g_bf_weights_file_name - confirm)
+-- "y" is the signal path number (appended in this file as "_" & y & ".hex").
+--
+-- RAM init files (.hex files) only work when g_weights_write_only is set to FALSE.
+--
+LIBRARY IEEE;
+USE IEEE.std_logic_1164.ALL;
+USE IEEE.numeric_std.ALL;
+LIBRARY common_lib, dp_lib, st_lib, ss_lib;
+USE common_lib.common_pkg.ALL;
+USE common_lib.common_mem_pkg.ALL;
+USE dp_lib.dp_stream_pkg.ALL;
+USE st_lib.ALL;
+USE ss_lib.ALL;
+USE work.bf_pkg.ALL;
+
+ENTITY bf_unit IS
+  GENERIC (
+    g_bf : t_c_bf := c_bf;
+    g_bf_weights_file_name : STRING := "UNUSED"; -- "UNUSED" or relative path to e.g. the bf/build/data/weights hex file for adr_w=8 and dat_w=32
+    g_ss_wide_file_prefix : STRING := "UNUSED"; -- path_to_file
+    g_weights_write_only : BOOLEAN := FALSE -- When set to TRUE the M9K blocks are forced to Simple Dual Port mode. When FALSE it is True Dual Port.
+  );
+  PORT (
+    -- System
+    dp_rst : IN STD_LOGIC := '0';
+    dp_clk : IN STD_LOGIC;
+    mm_rst : IN STD_LOGIC;
+    mm_clk : IN STD_LOGIC;
+
+    -- MM interface
+    ram_ss_ss_wide_mosi : IN t_mem_mosi := c_mem_mosi_rst;
+    ram_ss_ss_wide_miso : OUT t_mem_miso := c_mem_miso_rst;
+    ram_bf_weights_mosi : IN t_mem_mosi := c_mem_mosi_rst;
+    ram_bf_weights_miso : OUT t_mem_miso := c_mem_miso_rst;
+    ram_st_sst_mosi : IN t_mem_mosi := c_mem_mosi_rst; -- Power statistics memory
+    ram_st_sst_miso : OUT t_mem_miso := c_mem_miso_rst;
+    reg_st_sst_mosi : IN t_mem_mosi := c_mem_mosi_rst; -- Power statistics register
+    reg_st_sst_miso : OUT t_mem_miso := c_mem_miso_rst;
+
+    -- ST interface
+    in_sosi_arr : IN t_dp_sosi_arr(g_bf.nof_input_streams-1 DOWNTO 0); -- subbands; one or more signal paths per datapath input stream
+    in_siso_arr : OUT t_dp_siso_arr(g_bf.nof_input_streams-1 DOWNTO 0);
+    out_raw_sosi : OUT t_dp_sosi; -- original raw beamlets output with c_sum_w bits.
+    out_bst_sosi : OUT t_dp_sosi; -- requantized 16b beamlets output that is also used for internal BST.
+    out_qua_sosi : OUT t_dp_sosi -- requantized 8b beamlets output.
+  );
+END bf_unit;
+
+ARCHITECTURE str OF bf_unit IS
+
+  -- Operational mode:
+  CONSTANT c_unit_w : POSITIVE := g_bf.in_dat_w + g_bf.in_weight_w - c_sign_w; -- skip double sign bit
+  CONSTANT c_prod_w : POSITIVE := c_unit_w + c_sum_of_prod_w; -- keep bit for sum of products in complex multiply
+  CONSTANT c_gain_w : INTEGER := largest(g_bf.bst_gain_w, g_bf.out_gain_w); -- keep internal c_sum_w as wide as necessary to fit both BST and qua output
+  -- no need to account for adder bit growth of ceil_log2(g_bf.nof_signal_paths),
+  -- because default BF sum should not clip to allow next stage of BF
+  CONSTANT c_sum_w : POSITIVE := c_unit_w + c_gain_w; -- note use c_gain_w >= 1 if complex sum of products bit growth has to be preserved
+  CONSTANT c_bst_lsb_w : NATURAL := c_unit_w + g_bf.bst_gain_w - g_bf.bst_dat_w; -- number of LSbits removed by u_dp_requantize_bst
+  CONSTANT c_out_lsb_w : NATURAL := c_unit_w + g_bf.out_gain_w - g_bf.out_dat_w; -- number of LSbits removed by u_dp_requantize_out
+
+  CONSTANT c_conjugate : BOOLEAN := FALSE;
+  CONSTANT c_nof_signal_paths_per_stream : POSITIVE := g_bf.nof_signal_paths / g_bf.nof_input_streams;
+  CONSTANT c_nof_subbands_per_stream : POSITIVE := c_nof_signal_paths_per_stream * g_bf.nof_subbands;
+  CONSTANT c_xst_enable : BOOLEAN := TRUE;
+
+  CONSTANT c_weights_buf : t_c_mem := (latency => 1,
+                                       adr_w => ceil_log2(g_bf.nof_weights),
+                                       dat_w => c_nof_complex*g_bf.in_weight_w,
+                                       nof_dat => g_bf.nof_weights,
+                                       init_sl => '0');
+  -- Latencies
+  CONSTANT c_input_latency : NATURAL := 1; -- due to input register r
+  CONSTANT c_prod_latency : NATURAL := 3; -- NOTE(review): presumably the sum of the u_multiplier g_pipeline_* generics (1+0+1+1) - confirm
+  CONSTANT c_adder_stage_latency : NATURAL := 1;
+  CONSTANT c_nof_adder_stages : NATURAL := ceil_log2(g_bf.nof_signal_paths);
+  CONSTANT c_adder_tree_latency : NATURAL := c_nof_adder_stages * c_adder_stage_latency;
+  CONSTANT c_bf_prod_latency : NATURAL := c_input_latency + c_prod_latency;
+  CONSTANT c_bf_sum_latency : NATURAL := c_bf_prod_latency + c_adder_tree_latency;
+  CONSTANT c_bf_unit_latency : NATURAL := c_bf_sum_latency; -- used as g_pipeline of u_dp_pipeline for the sosi control
+
+  TYPE t_slv_data_in_arr IS ARRAY (INTEGER RANGE <>) OF STD_LOGIC_VECTOR(g_bf.in_dat_w-1 DOWNTO 0);
+  TYPE t_slv_weight_in_arr IS ARRAY (INTEGER RANGE <>) OF STD_LOGIC_VECTOR(g_bf.in_weight_w-1 DOWNTO 0);
+  TYPE t_slv_prod_arr IS ARRAY (INTEGER RANGE <>) OF STD_LOGIC_VECTOR(c_prod_w-1 DOWNTO 0);
+
+  TYPE reg_type IS RECORD
+    bf_in_sosi_arr : t_dp_sosi_arr( g_bf.nof_signal_paths-1 DOWNTO 0);
+  END RECORD;
+
+  SIGNAL mult_miso_arr : t_mem_miso_arr(g_bf.nof_signal_paths-1 DOWNTO 0) := (others => c_mem_miso_rst); -- MM interfaces between weights-memory and complex multipliers.
+
+  SIGNAL r, rin : reg_type;
+
+  SIGNAL ss_wide_in_sosi_arr : t_dp_sosi_arr( g_bf.nof_signal_paths-1 DOWNTO 0); -- for each signal path a datapath stream interface.
+  SIGNAL ss_wide_in_siso_arr : t_dp_siso_arr( g_bf.nof_signal_paths-1 DOWNTO 0); -- for each signal path a datapath stream interface.
+  SIGNAL bf_in_sosi_arr : t_dp_sosi_arr( g_bf.nof_signal_paths-1 DOWNTO 0); -- for each signal path a datapath stream interface.
+
+  SIGNAL ram_ss_ss_wide_mosi_arr : t_mem_mosi_arr(g_bf.nof_input_streams-1 DOWNTO 0);
+  SIGNAL ram_ss_ss_wide_miso_arr : t_mem_miso_arr(g_bf.nof_input_streams-1 DOWNTO 0);
+
+  SIGNAL mm_weight_mosi_arr : t_mem_mosi_arr(g_bf.nof_signal_paths-1 DOWNTO 0); -- for each signal path an MM interface for accessing the weight factors (wr, wrdata, address, rd)
+  SIGNAL mm_weight_miso_arr : t_mem_miso_arr(g_bf.nof_signal_paths-1 DOWNTO 0) := (OTHERS => c_mem_miso_rst); -- for each signal path the MM read back (rddata, rdval) of the weight factors
+
+  SIGNAL data_re_arr : t_slv_data_in_arr( g_bf.nof_signal_paths-1 DOWNTO 0);
+  SIGNAL data_im_arr : t_slv_data_in_arr( g_bf.nof_signal_paths-1 DOWNTO 0);
+
+  SIGNAL weight_addr : STD_LOGIC_VECTOR(c_weights_buf.adr_w-1 DOWNTO 0); -- Address for the weight factors memory
+  SIGNAL weight_re_arr : t_slv_weight_in_arr(g_bf.nof_signal_paths-1 DOWNTO 0);
+  SIGNAL weight_im_arr : t_slv_weight_in_arr(g_bf.nof_signal_paths-1 DOWNTO 0);
+
+  SIGNAL prod_re_arr : t_slv_prod_arr( g_bf.nof_signal_paths-1 DOWNTO 0);
+  SIGNAL prod_im_arr : t_slv_prod_arr( g_bf.nof_signal_paths-1 DOWNTO 0);
+  SIGNAL prod_re_vec : STD_LOGIC_VECTOR(c_prod_w*g_bf.nof_signal_paths-1 DOWNTO 0);
+  SIGNAL prod_im_vec : STD_LOGIC_VECTOR(c_prod_w*g_bf.nof_signal_paths-1 DOWNTO 0);
+
+  SIGNAL sum_re : STD_LOGIC_VECTOR(c_sum_w-1 DOWNTO 0);
+  SIGNAL sum_im : STD_LOGIC_VECTOR(c_sum_w-1 DOWNTO 0);
+
+  SIGNAL piped_sosi : t_dp_sosi;
+  SIGNAL beams_raw_sosi : t_dp_sosi;
+  SIGNAL beams_bst_sosi : t_dp_sosi;
+
+BEGIN
+
+  ------------------------------------------------------------------------------
+  -- Input registers
+  ------------------------------------------------------------------------------
+  comb : PROCESS(r, bf_in_sosi_arr) -- combinatorial part: rin is the next value for register r
+    VARIABLE v : reg_type;
+  BEGIN
+    v := r;
+    v.bf_in_sosi_arr := bf_in_sosi_arr; -- register the ss_wide outputs
+    rin <= v;
+  END PROCESS comb;
+
+  regs : PROCESS(dp_clk)
+  BEGIN
+    IF rising_edge(dp_clk) THEN
+      r <= rin; -- no reset is applied to the input register
+    END IF;
+  END PROCESS;
+
+  u_mem_mux_ss_wide : ENTITY common_lib.common_mem_mux -- connects the ram_ss_ss_wide MM port to one MM interface per input stream
+  GENERIC MAP (
+    g_nof_mosi => g_bf.nof_input_streams,
+    g_mult_addr_w => ceil_log2(g_bf.nof_weights*c_nof_signal_paths_per_stream)
+  )
+  PORT MAP (
+    mosi => ram_ss_ss_wide_mosi,
+    miso => ram_ss_ss_wide_miso,
+    mosi_arr => ram_ss_ss_wide_mosi_arr,
+    miso_arr => ram_ss_ss_wide_miso_arr
+  );
+
+  ------------------------------------------------------------------------------
+  -- The beamformer unit
+  ------------------------------------------------------------------------------
+  -- A set of ss_wide units is used to distribute the incoming subbands to the data-inputs of the
+  -- beamformer multipliers.
+  gen_ss_wide : FOR I IN 0 TO g_bf.nof_input_streams-1 GENERATE
+    gen_copy_input : FOR J IN 0 TO c_nof_signal_paths_per_stream-1 GENERATE
+      ss_wide_in_sosi_arr(I*c_nof_signal_paths_per_stream+J) <= in_sosi_arr(I); -- every signal path of stream I gets a copy of the same input stream
+    END GENERATE;
+
+    in_siso_arr(I) <= ss_wide_in_siso_arr(I*c_nof_signal_paths_per_stream); -- only the siso of the first signal path of stream I is passed back
+
+    u_ss_wide : ENTITY ss_lib.ss_wide
+    GENERIC MAP (
+      g_wb_factor => c_nof_signal_paths_per_stream,
+      g_dsp_data_w => g_bf.in_dat_w,
+      g_nof_ch_in => c_nof_subbands_per_stream,
+      g_nof_ch_sel => g_bf.nof_weights,
+      g_select_file_prefix => g_ss_wide_file_prefix
+    )
+    PORT MAP (
+      mm_rst => mm_rst,
+      mm_clk => mm_clk,
+      dp_rst => dp_rst,
+      dp_clk => dp_clk,
+
+      -- Memory Mapped
+      ram_ss_ss_wide_mosi => ram_ss_ss_wide_mosi_arr(I),
+      ram_ss_ss_wide_miso => ram_ss_ss_wide_miso_arr(I),
+
+      -- Streaming
+      input_sosi_arr => ss_wide_in_sosi_arr((I+1)*c_nof_signal_paths_per_stream-1 DOWNTO I*c_nof_signal_paths_per_stream),
+      input_siso_arr => ss_wide_in_siso_arr((I+1)*c_nof_signal_paths_per_stream-1 DOWNTO I*c_nof_signal_paths_per_stream),
+      output_sosi_arr => bf_in_sosi_arr((I+1)*c_nof_signal_paths_per_stream-1 DOWNTO I*c_nof_signal_paths_per_stream)
+    );
+  END GENERATE;
+
+  -- Connect the ram_bf_weights MM port of bf_unit to the internal array of MM interfaces, one per weight memory
+  u_mem_mux_weight : ENTITY common_lib.common_mem_mux
+  GENERIC MAP (
+    g_nof_mosi => g_bf.nof_signal_paths,
+    g_mult_addr_w => ceil_log2(g_bf.nof_weights)
+  )
+  PORT MAP (
+    mosi => ram_bf_weights_mosi,
+    miso => ram_bf_weights_miso,
+    mosi_arr => mm_weight_mosi_arr,
+    miso_arr => mm_weight_miso_arr
+  );
+
+  gen_bf : FOR I IN 0 TO g_bf.nof_signal_paths-1 GENERATE
+    -- Instantiate a weight factor memory for each input stage:
+    u_weight_ram : ENTITY common_lib.common_ram_crw_crw
+    GENERIC MAP (
+      g_ram => c_weights_buf,
+      g_init_file => sel_a_b(g_bf_weights_file_name = "UNUSED", g_bf_weights_file_name, g_bf_weights_file_name & "_" & NATURAL'IMAGE(I) & ".hex"),
+      g_true_dual_port => NOT(g_weights_write_only)
+    )
+    PORT MAP (
+      -- MM side
+      rst_a => mm_rst,
+      clk_a => mm_clk,
+      wr_en_a => mm_weight_mosi_arr(I).wr,
+      wr_dat_a => mm_weight_mosi_arr(I).wrdata(c_weights_buf.dat_w -1 DOWNTO 0),
+      adr_a => mm_weight_mosi_arr(I).address(c_weights_buf.adr_w-1 DOWNTO 0),
+      rd_en_a => mm_weight_mosi_arr(I).rd,
+      rd_dat_a => mm_weight_miso_arr(I).rddata(c_weights_buf.dat_w -1 DOWNTO 0),
+      rd_val_a => mm_weight_miso_arr(I).rdval,
+      -- MULT side
+      rst_b => dp_rst,
+      clk_b => dp_clk,
+      wr_en_b => '0', -- the dp_clk side never writes
+      wr_dat_b => (others =>'0'),
+      adr_b => weight_addr, -- the same read address is used for all weight memories
+      rd_en_b => '1', -- the dp_clk side reads continuously
+      rd_dat_b => mult_miso_arr(I).rddata(c_weights_buf.dat_w-1 DOWNTO 0),
+      rd_val_b => OPEN
+    );
+
+    data_re_arr(I) <= r.bf_in_sosi_arr(I).re(g_bf.in_dat_w-1 DOWNTO 0);
+    data_im_arr(I) <= r.bf_in_sosi_arr(I).im(g_bf.in_dat_w-1 DOWNTO 0);
+
+    weight_re_arr(I) <= mult_miso_arr(I).rddata( g_bf.in_weight_w-1 DOWNTO 0); -- real part in the low half of the weight word
+    weight_im_arr(I) <= mult_miso_arr(I).rddata(2*g_bf.in_weight_w-1 DOWNTO g_bf.in_weight_w); -- imaginary part in the high half of the weight word
+
+    u_multiplier : ENTITY common_lib.common_complex_mult(stratix4)
+    GENERIC MAP (
+      g_in_a_w => g_bf.in_weight_w,
+      g_in_b_w => g_bf.in_dat_w,
+      g_out_p_w => c_prod_w,
+      g_conjugate_b => c_conjugate,
+      g_pipeline_input => 1,
+      g_pipeline_product => 0,
+      g_pipeline_adder => 1,
+      g_pipeline_output => 1
+    )
+    PORT MAP (
+      rst => dp_rst,
+      clk => dp_clk,
+      in_ar => weight_re_arr(I),
+      in_ai => weight_im_arr(I),
+      in_br => data_re_arr(I),
+      in_bi => data_im_arr(I),
+      out_pr => prod_re_arr(I),
+      out_pi => prod_im_arr(I)
+    );
+
+    -- Map the product array to a vector for the adder tree input
+    prod_re_vec((I+1)*c_prod_w-1 DOWNTO I*c_prod_w) <= prod_re_arr(I);
+    prod_im_vec((I+1)*c_prod_w-1 DOWNTO I*c_prod_w) <= prod_im_arr(I);
+  END GENERATE gen_bf;
+
+  -- One adder tree for the real part
+  u_adder_tree_re : ENTITY common_lib.common_adder_tree(str)
+  GENERIC MAP (
+    g_representation => "SIGNED",
+    g_pipeline => c_adder_stage_latency,
+    g_nof_inputs => g_bf.nof_signal_paths,
+    g_dat_w => c_prod_w,
+    g_sum_w => c_sum_w
+  )
+  PORT MAP (
+    clk => dp_clk,
+    in_dat => prod_re_vec,
+    sum => sum_re
+  );
+
+  -- One adder tree for the imaginary part
+  u_adder_tree_im : ENTITY common_lib.common_adder_tree(str)
+  GENERIC MAP (
+    g_representation => "SIGNED",
+    g_pipeline => c_adder_stage_latency,
+    g_nof_inputs => g_bf.nof_signal_paths,
+    g_dat_w => c_prod_w,
+    g_sum_w => c_sum_w
+  )
+  PORT MAP (
+    clk => dp_clk,
+    in_dat => prod_im_vec,
+    sum => sum_im
+  );
+
+  ------------------------------------------------------------------------------
+  -- Counter used to create addresses for the weight memory
+  ------------------------------------------------------------------------------
+  weight_adrs_cnt : ENTITY common_lib.common_counter
+  GENERIC MAP(
+    g_latency => 1,
+    g_init => 0,
+    g_width => c_weights_buf.adr_w,
+    g_max => g_bf.nof_weights-1,
+    g_step_size => 1
+  )
+  PORT MAP (
+    rst => dp_rst,
+    clk => dp_clk,
+    cnt_clr => bf_in_sosi_arr(0).eop, -- driven by the eop of signal path 0 (unregistered ss_wide output)
+    cnt_en => bf_in_sosi_arr(0).valid, -- driven by the valid of signal path 0 (unregistered ss_wide output)
+    count => weight_addr
+  );
+
+  ------------------------------------------------------------------------------
+  -- Pipeline to align the sosi control
+  ------------------------------------------------------------------------------
+  u_dp_pipeline : ENTITY dp_lib.dp_pipeline
+  GENERIC MAP(
+    g_pipeline => c_bf_unit_latency
+  )
+  PORT MAP (
+    rst => dp_rst,
+    clk => dp_clk,
+    -- ST sink
+    snk_in => bf_in_sosi_arr(0), -- only signal path 0 is pipelined; its re and im fields are replaced by the sums below
+    -- ST source
+    src_out => piped_sosi
+  );
+
+  PROCESS(piped_sosi, sum_re, sum_im)
+  BEGIN
+    beams_raw_sosi <= piped_sosi; -- copy all fields, then override re and im with the adder tree sums
+    beams_raw_sosi.re <= RESIZE_DP_DSP_DATA(sum_re);
+    beams_raw_sosi.im <= RESIZE_DP_DSP_DATA(sum_im);
+  END PROCESS;
+
+  ------------------------------------------------------------------------------
+  -- Beamlets output
+  ------------------------------------------------------------------------------
+  -- Pass on raw beams data without any further requantization
+  out_raw_sosi <= beams_raw_sosi;
+
+  -- Requantize for internal BST and out_bst_sosi output
+  u_dp_requantize_bst : ENTITY dp_lib.dp_requantize
+  GENERIC MAP (
+    g_complex => TRUE,
+    g_representation => "SIGNED",
+    g_lsb_w => c_bst_lsb_w,
+    g_lsb_round => TRUE,
+    g_lsb_round_clip => FALSE,
+    g_msb_clip => FALSE, -- default BF should not clip
+    g_msb_clip_symmetric => FALSE,
+    g_gain_w => 0,
+    g_pipeline_remove_lsb => 1,
+    g_pipeline_remove_msb => 0,
+    g_in_dat_w => c_sum_w,
+    g_out_dat_w => g_bf.bst_dat_w
+  )
+  PORT MAP (
+    rst => dp_rst,
+    clk => dp_clk,
+    snk_in => beams_raw_sosi,
+    src_out => beams_bst_sosi,
+    out_ovr => OPEN
+  );
+
+  out_bst_sosi <= beams_bst_sosi;
+
+  -- Requantize for out_qua_sosi output
+  u_dp_requantize_out : ENTITY dp_lib.dp_requantize
+  GENERIC MAP (
+    g_complex => TRUE,
+    g_representation => "SIGNED",
+    g_lsb_w => c_out_lsb_w,
+    g_lsb_round => TRUE,
+    g_lsb_round_clip => FALSE,
+    g_msb_clip => FALSE, -- default BF should not clip
+    g_msb_clip_symmetric => FALSE,
+    g_gain_w => 0,
+    g_pipeline_remove_lsb => 1,
+    g_pipeline_remove_msb => 0,
+    g_in_dat_w => c_sum_w,
+    g_out_dat_w => g_bf.out_dat_w
+  )
+  PORT MAP (
+    rst => dp_rst,
+    clk => dp_clk,
+    snk_in => beams_raw_sosi,
+    src_out => out_qua_sosi,
+    out_ovr => OPEN
+  );
+
+  ------------------------------------------------------------------------------
+  -- Internal BST
+  ------------------------------------------------------------------------------
+  u_beamlet_statistics : ENTITY st_lib.st_sst
+  GENERIC MAP(
+    g_nof_stat => g_bf.nof_weights,
+    g_xst_enable => c_xst_enable,
+    g_in_data_w => g_bf.bst_dat_w,
+    g_stat_data_w => g_bf.stat_data_w,
+    g_stat_data_sz => g_bf.stat_data_sz
+  )
+  PORT MAP (
+    mm_rst => mm_rst,
+    mm_clk => mm_clk,
+    dp_rst => dp_rst,
+    dp_clk => dp_clk,
+    in_complex => beams_bst_sosi, -- statistics are taken from the BST requantizer output
+    ram_st_sst_mosi => ram_st_sst_mosi,
+    ram_st_sst_miso => ram_st_sst_miso,
+    reg_st_sst_mosi => reg_st_sst_mosi,
+    reg_st_sst_miso => reg_st_sst_miso
+  );
+
+END str;