diff --git a/applications/aartfaac/designs/aartfaac_bn_sdo/hdllib.cfg b/applications/aartfaac/designs/aartfaac_bn_sdo/hdllib.cfg
index c2f8f2c1fd3fcb856618029b454dc94492bbc500..f313015ad42c7e841a8000bd0ffc3f8d43c75b64 100644
--- a/applications/aartfaac/designs/aartfaac_bn_sdo/hdllib.cfg
+++ b/applications/aartfaac/designs/aartfaac_bn_sdo/hdllib.cfg
@@ -9,12 +9,24 @@ synth_top_level_entity =
 
 quartus_copy_files =
     src/quartus/sopc_aartfaac_bn_sdo.sopc .
-    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/ hex
-    $SVN/Aartfaac/trunk/Firmware/modules/rsp_terminal/src/hex/ hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/udp_sdo_ss.hex hex
+    src/hex/ hex
 
 modelsim_copy_files = 
-    $SVN/Aartfaac/trunk/Firmware/modules/rsp_terminal/src/hex/ hex
-    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/ hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_0.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_1.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_2.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_3.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_4.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_5.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_6.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_7.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_8.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_9.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_10.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/subband_dat_11.hex hex
+    $SVN/Aartfaac/trunk/Firmware/designs/aartfaac_bn_sdo/src/hex/udp_sdo_ss.hex hex
+    src/hex/ hex
 
 synth_files =
     $HDL_BUILD_DIR/unb1/quartus/aartfaac_bn_sdo/sopc_aartfaac_bn_sdo.vhd
diff --git a/applications/aartfaac/designs/aartfaac_bn_sdo/src/python/gen_hex_files_ss_parallel_sb_16b.py b/applications/aartfaac/designs/aartfaac_bn_sdo/src/python/gen_hex_files_ss_parallel_sb_16b.py
new file mode 100644
index 0000000000000000000000000000000000000000..4924c7fc0125a8f1357f5d86ff3937b78ba40948
--- /dev/null
+++ b/applications/aartfaac/designs/aartfaac_bn_sdo/src/python/gen_hex_files_ss_parallel_sb_16b.py
@@ -0,0 +1,115 @@
+###############################################################################
+#
+# Copyright (C) 2015
+# ASTRON (Netherlands Institute for Radio Astronomy) <http://www.astron.nl/>
+# P.O.Box 2, 7990 AA Dwingeloo, The Netherlands
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+###############################################################################
+
+from common import *
+from mem_init_file import list_to_hex
+import pi_ss_parallel
+
+# Purpose:
+# . Generate the HEX files for the ss_parallel instance.
+# Description:
+# . In 16b mode, no extra reordering takes place: dout=din.
+# . The ss_parallel instance requires 3 types of hex files (4 in total):
+#   . Input reorder stage  : ss_parallel_sb_16b_reorder_in.hex
+#   . Main selection stage : ss_parallel_sb_16b_ss_wide_0.hex
+#                          : ss_parallel_sb_16b_ss_wide_1.hex
+#   . Output reorder stage : ss_parallel_sb_16b_reorder_out.hex
+#
+#  Remark:
+
+NOF_IN = 2
+NOF_INTERNALS = 2
+NOT_OUT = 2 
+FRAME_SIZE_IN = 864 # 12*72 or 9*96
+FRAME_SIZE_OUT = FRAME_SIZE_IN 
+
+HEX_REORDER_IN_MEM_WIDTH = 8 # Actually 1 bit but HEX files require byte boundaries  
+HEX_REORDER_IN_MEM_DEPTH = 1024 # 864 cycles * 1 bit to encode the 2 inputs on each word = 864 regs
+HEX_REORDER_IN_FILE_NAME = "../hex/ss_parallel_sb_16b_reorder_in.hex"
+
+HEX_SS_WIDE_MEM_WIDTH = 10
+HEX_SS_WIDE_MEM_DEPTH = 1024 # size = 864 words each (matches output block size) for each of its 2 internal instances
+HEX_SS_WIDE_FILE_PREFIX = "../hex/ss_parallel_sb_16b_ss_wide_"
+
+HEX_REORDER_OUT_MEM_WIDTH = 8 # Actually 4 bits but HEX files require byte boundaries
+HEX_REORDER_OUT_MEM_DEPTH = 1024 # 864 cycles * 4 bits to encode the 12 inputs on each word = 864 regs
+HEX_REORDER_OUT_FILE_NAME = "../hex/ss_parallel_sb_16b_reorder_out.hex"
+
+# This script is not a test case (TC), but a Testcase instance is needed to create the PiSsParallel object
+import test_case
+dummy_tc = test_case.Testcase('','')
+# We won't be using IO either.
+import node_io
+dummy_io = node_io.NodeIO(dummy_tc.nodeImages, dummy_tc.base_ip)
+
+ss = pi_ss_parallel.PiSsParallel(dummy_tc, dummy_io, NOF_IN, NOF_INTERNALS, NOT_OUT, FRAME_SIZE_IN, FRAME_SIZE_OUT)
+
+# ========================
+# Create the input matrix:
+# ========================
+# stream 0) [ (0,0), (0,1), .. , (0,862), (0,863) ]
+# stream 1) [ (1,0), (1,1), .. , (1,862), (1,863) ]
+# (assumes create_Din() yields NOF_IN streams of FRAME_SIZE_IN elements - TODO confirm)
+din = ss.create_Din()
+
+# =========================
+# Create the output matrix:
+# =========================
+dout = din
+
+# ======================
+# Generate the settings:
+# ======================
+[result, Rin, Dram, Dsel, Rout, Errout] = ss.create_settings(din, dout)
+
+# =====================================================
+# Create the selection buffer values from the settings:
+# =====================================================
+reorder_in_buf  = ss.ssReorderIn.create_selection_buf(Rin)
+select_buf = flatten(Dsel) 
+reorder_out_buf = ss.ssReorderOut.create_selection_buf(Rout)
+
+# ================================
+# Generate hex file: input reorder
+# ================================
+list_to_hex(reorder_in_buf, HEX_REORDER_IN_FILE_NAME, HEX_REORDER_IN_MEM_WIDTH, HEX_REORDER_IN_MEM_DEPTH)
+
+# ===========================
+# Generate hex files: ss_wide
+# ===========================
+# First replace the don't cares (-1) with zeroes for list_to_hex (requires integers)
+for n,i in enumerate(select_buf):
+    if i==-1:
+        select_buf[n]=0
+
+
+# The created select_buf is a flat list meant to MM write to several instances 
+# from a certain offset. However, we want to create a HEX file for each
+# individual instance, so split the list into 2 (sublist size of 864).
+for i, sublist in zip(range(2), split_list(select_buf, 864)):
+    list_to_hex(sublist, HEX_SS_WIDE_FILE_PREFIX+str(i)+".hex", HEX_SS_WIDE_MEM_WIDTH, HEX_SS_WIDE_MEM_DEPTH)
+
+# ==================================
+# Generate hex files: output reorder
+# ==================================
+# The output reorder list contains 864 words of 4 bits, so each word easily 
+# fits within 32b words.
+list_to_hex(reorder_out_buf, HEX_REORDER_OUT_FILE_NAME, HEX_REORDER_OUT_MEM_WIDTH, HEX_REORDER_OUT_MEM_DEPTH)
diff --git a/applications/aartfaac/designs/aartfaac_bn_sdo/src/vhdl/aartfaac_bn_sdo.vhd b/applications/aartfaac/designs/aartfaac_bn_sdo/src/vhdl/aartfaac_bn_sdo.vhd
index 1b28651eb13845f688e3cc0606e44fcc481c1f5a..cf57de8133e2b67ef57dcd5c82ee8978c0507406 100644
--- a/applications/aartfaac/designs/aartfaac_bn_sdo/src/vhdl/aartfaac_bn_sdo.vhd
+++ b/applications/aartfaac/designs/aartfaac_bn_sdo/src/vhdl/aartfaac_bn_sdo.vhd
@@ -219,11 +219,20 @@ ARCHITECTURE str OF aartfaac_bn_sdo IS
   SIGNAL io_rsp_terminal_rsp_snk_in_arr      : t_dp_sosi_arr(c_rsp_terminal_nof_lanes-1 DOWNTO 0);
   SIGNAL io_rsp_terminal_rsp_snk_out_arr     : t_dp_siso_arr(c_rsp_terminal_nof_lanes-1 DOWNTO 0);
   SIGNAL io_rsp_terminal_subband_src_out_arr : t_dp_sosi_arr(c_rsp_terminal_nof_lanes-1 DOWNTO 0);
+
   SIGNAL ss_parallel_sp_snk_out_arr          : t_dp_siso_arr(c_rsp_terminal_nof_lanes-1 DOWNTO 0) := (OTHERS=>c_dp_siso_rdy);
   SIGNAL ss_parallel_sp_snk_in_arr           : t_dp_sosi_arr(c_rsp_terminal_nof_lanes-1 DOWNTO 0);
   SIGNAL ss_parallel_sp_src_out_arr          : t_dp_sosi_arr(c_nof_offload_streams-1 DOWNTO 0);
-  SIGNAL dp_fifo_sc_src_out_arr              : t_dp_sosi_arr(c_nof_offload_streams-1 DOWNTO 0);
-  SIGNAL dp_fifo_sc_src_in_arr               : t_dp_siso_arr(c_nof_offload_streams-1 DOWNTO 0);
+  SIGNAL ss_parallel_sp_fifo_src_out_arr     : t_dp_sosi_arr(c_nof_offload_streams-1 DOWNTO 0);
+  SIGNAL ss_parallel_sp_fifo_src_in_arr      : t_dp_siso_arr(c_nof_offload_streams-1 DOWNTO 0);
+
+  SIGNAL ss_parallel_sb_snk_out_arr          : t_dp_siso_arr(2-1 DOWNTO 0) := (OTHERS=>c_dp_siso_rdy);
+  SIGNAL ss_parallel_sb_snk_in_arr           : t_dp_sosi_arr(2-1 DOWNTO 0);
+  SIGNAL ss_parallel_sb_src_out_arr          : t_dp_sosi_arr(2-1 DOWNTO 0);
+  SIGNAL ss_parallel_sb_fifo_snk_in_arr      : t_dp_sosi_arr(c_nof_offload_streams-1 DOWNTO 0);
+  SIGNAL ss_parallel_sb_fifo_src_out_arr     : t_dp_sosi_arr(c_nof_offload_streams-1 DOWNTO 0);
+  SIGNAL ss_parallel_sb_fifo_src_in_arr      : t_dp_siso_arr(c_nof_offload_streams-1 DOWNTO 0);
+
   SIGNAL aartfaac_bn_sdo_udp_sdo_src_out_arr : t_dp_sosi_arr(c_nof_offload_streams-1 DOWNTO 0);
   SIGNAL aartfaac_bn_sdo_udp_sdo_src_in_arr  : t_dp_siso_arr(c_nof_offload_streams-1 DOWNTO 0);
 
@@ -417,10 +426,10 @@ BEGIN
       g_reorder_out_file_name => "hex/ss_parallel_sp_reorder_out.hex"
     )
     PORT MAP (
-      mm_rst                  =>  mm_rst,
-      mm_clk                  =>  mm_clk,
-      dp_rst                  =>  dp_rst,
-      dp_clk                  =>  dp_clk,   
+      mm_rst                  => mm_rst,
+      mm_clk                  => mm_clk,
+      dp_rst                  => dp_rst,
+      dp_clk                  => dp_clk,   
   
       ram_ss_reorder_in_mosi  => ram_ss_reorder_in_mosi,
       ram_ss_reorder_in_miso  => ram_ss_reorder_in_miso,
@@ -435,9 +444,8 @@ BEGIN
       output_siso_arr         => (OTHERS=>c_dp_siso_rdy)
     );
   
-    -- FIFO required as ss_parallel_sp does not have src flow control and dp_offload de-asserts
-    -- its snk_out.ready.
-    u_dp_fifo_sc : ENTITY dp_lib.dp_fifo_sc
+    -- FIFO required as ss_parallel_sp does not have src flow control 
+    u_dp_fifo_sc_sp : ENTITY dp_lib.dp_fifo_sc
     GENERIC MAP (
       g_data_w      => c_nof_complex*c_rsp_terminal_subband_dat_w,
       g_bsn_w       => c_dp_stream_bsn_w,
@@ -460,10 +468,108 @@ BEGIN
       snk_out     => OPEN,   
       snk_in      => ss_parallel_sp_src_out_arr(0),
   
-      src_in      => dp_fifo_sc_src_in_arr(0),
-      src_out     => dp_fifo_sc_src_out_arr(0)
+      src_in      => ss_parallel_sp_fifo_src_in_arr(0),
+      src_out     => ss_parallel_sp_fifo_src_out_arr(0)
+    );
+
+    -----------------------------------------------------------------------------
+    -- Subband (SB) reordering for 8b mode:
+    -- . When in 8b mode, input subbands are no longer grouped together like in 16b mode; they're interleaved.
+    -- . The GPU correlator would then need to de-interleave the 8b subbands, which would take too many CPU resources.
+    -- . So we need to group together the 8b subbands as shown in the '8b output format' below:
+    -- 16b  input format : [ [(SB0,SP0),            ..            ,(SB0,SP95)], .. , [(SB8 ,SP0),             ..             ,(SB8 , SP95)] ] data bits 15..0
+    --  8b  input format : [ [(SB0,SP0),            ..            ,(SB0,SP95)], .. , [(SB16,SP0),             ..             ,(SB16, SP95)] ] data bits  7..0 // interleaved 8b subbands,  bad for GPU machine
+    --                     [ [(SB1,SP0),            ..            ,(SB1,SP95)], .. , [(SB17,SP0),             ..             ,(SB17, SP95)] ] data bits 15..8 // interleaved 8b subbands,  bad for GPU machine
+    --  8b output format : [ [(SB0,SP0),..,(SB0,SP94),(SB1,SP0),..,(SB1,SP94)], .. , [(SB16,SP0),..,(SB16,SP94),(SB17,SP0),..,(SB17, SP94)] ] data bits  7..0 // reordered   8b subbands, good for GPU machine
+    --                     [ [(SB0,SP1),..,(SB0,SP95),(SB1,SP1),..,(SB1,SP95)], .. , [(SB16,SP1),..,(SB16,SP95),(SB17,SP1),..,(SB17, SP95)] ] data bits 15..8 // reordered   8b subbands, good for GPU machine
+    -----------------------------------------------------------------------------
+ 
+    -- Rewire 1*16b to 2*8b
+    p_connect_sb : PROCESS(ss_parallel_sp_fifo_src_out_arr, ss_parallel_sb_snk_out_arr)
+    BEGIN
+      -- ctrl
+      ss_parallel_sb_snk_in_arr(0) <= ss_parallel_sp_fifo_src_out_arr(0);
+      ss_parallel_sb_snk_in_arr(1) <= ss_parallel_sp_fifo_src_out_arr(0);
+      -- flow control in opposite direction
+      ss_parallel_sp_fifo_src_in_arr(0) <= ss_parallel_sb_snk_out_arr(0);
+      -- Subband data: even 8b subband indices are located in former 16b real part
+      ss_parallel_sb_snk_in_arr(0).im(7 DOWNTO 0) <= ss_parallel_sp_fifo_src_out_arr(0).re(15 DOWNTO 8);
+      ss_parallel_sb_snk_in_arr(0).re(7 DOWNTO 0) <= ss_parallel_sp_fifo_src_out_arr(0).re( 7 DOWNTO 0);
+      -- Subband data: odd  8b subband indices are located in former 16b imaginary part
+      ss_parallel_sb_snk_in_arr(1).im(7 DOWNTO 0) <= ss_parallel_sp_fifo_src_out_arr(0).im(15 DOWNTO 8);
+      ss_parallel_sb_snk_in_arr(1).re(7 DOWNTO 0) <= ss_parallel_sp_fifo_src_out_arr(0).im( 7 DOWNTO 0);
+    END PROCESS;
+
+    -- Re-order the two 8b subband streams
+    u_ss_parallel_sb : ENTITY ss_lib.ss_parallel
+    GENERIC MAP(                         
+      g_nof_inputs            => 2,
+      g_nof_internals         => 2,
+      g_nof_outputs           => 2,
+      g_dsp_data_w            => 8,
+      g_frame_size_in         => c_rsp_terminal_nof_subbands_per_lane * c_rsp_terminal_nof_lanes, -- = 864 (9*96),
+      g_frame_size_out        => c_rsp_terminal_nof_subbands_per_lane * c_rsp_terminal_nof_lanes, -- = 864 (9*96)
+      g_reorder_in_file_name  => "hex/ss_parallel_sb_16b_reorder_in.hex",
+      g_ss_wide_file_prefix   => "hex/ss_parallel_sb_16b_ss_wide",
+      g_reorder_out_file_name => "hex/ss_parallel_sb_16b_reorder_out.hex"
+    )
+    PORT MAP (
+      mm_rst                  => mm_rst,
+      mm_clk                  => mm_clk,
+      dp_rst                  => dp_rst,
+      dp_clk                  => dp_clk,   
+
+      ram_ss_reorder_in_mosi  => c_mem_mosi_rst,
+      ram_ss_reorder_in_miso  => OPEN,
+      ram_ss_reorder_out_mosi => c_mem_mosi_rst,
+      ram_ss_reorder_out_miso => OPEN,
+      ram_ss_ss_wide_mosi     => c_mem_mosi_rst,
+      ram_ss_ss_wide_miso     => OPEN,
+
+      input_sosi_arr          => ss_parallel_sb_snk_in_arr,
+      input_siso_arr          => ss_parallel_sb_snk_out_arr,
+      output_sosi_arr         => ss_parallel_sb_src_out_arr,
+      output_siso_arr         => (OTHERS=>c_dp_siso_rdy)
     );
+
+    -- Rewire 2*8b back to 1*16b
+    p_connect : PROCESS(ss_parallel_sb_src_out_arr)
+    BEGIN
+      -- ctrl
+      ss_parallel_sb_fifo_snk_in_arr(0) <= ss_parallel_sb_src_out_arr(0);
+      -- Data
+      ss_parallel_sb_fifo_snk_in_arr(0).im(15 DOWNTO 0) <= ss_parallel_sb_src_out_arr(1).im(7 DOWNTO 0) & ss_parallel_sb_src_out_arr(1).re(7 DOWNTO 0);
+      ss_parallel_sb_fifo_snk_in_arr(0).re(15 DOWNTO 0) <= ss_parallel_sb_src_out_arr(0).im(7 DOWNTO 0) & ss_parallel_sb_src_out_arr(0).re(7 DOWNTO 0);
+    END PROCESS;
+
+    -- FIFO required as ss_parallel_sb does not have src flow control 
+    u_dp_fifo_sc_sb : ENTITY dp_lib.dp_fifo_sc
+    GENERIC MAP (
+      g_data_w      => c_nof_complex*c_rsp_terminal_subband_dat_w,
+      g_bsn_w       => c_dp_stream_bsn_w,
+      g_empty_w     => c_dp_stream_empty_w,
+      g_channel_w   => c_dp_stream_channel_w,
+      g_error_w     => c_dp_stream_error_w,
+      g_use_complex => TRUE,
+      g_use_bsn     => TRUE,
+      g_use_empty   => TRUE,
+      g_use_channel => TRUE,
+      g_use_error   => TRUE,
+      g_use_sync    => TRUE,
+      g_use_ctrl    => TRUE,
+      g_fifo_size   => 100 
+    )
+    PORT MAP (
+      rst         => dp_rst,
+      clk         => dp_clk,
   
+      snk_out     => OPEN,   
+      snk_in      => ss_parallel_sb_fifo_snk_in_arr(0),
+  
+      src_in      => ss_parallel_sb_fifo_src_in_arr(0),
+      src_out     => ss_parallel_sb_fifo_src_out_arr(0)
+    );
+
     -----------------------------------------------------------------------------
     -- Subband offload:
     -- . Selects 8 out of 9 subbands per block = 8*96=768 32b words
@@ -478,8 +584,8 @@ BEGIN
       dp_rst                         => dp_rst,
       dp_clk                         => dp_clk,
   
-      snk_in_arr                     => dp_fifo_sc_src_out_arr,
-      snk_out_arr                    => dp_fifo_sc_src_in_arr,
+      snk_in_arr                     => ss_parallel_sb_fifo_src_out_arr,
+      snk_out_arr                    => ss_parallel_sb_fifo_src_in_arr,
   
       src_out_arr                    => aartfaac_bn_sdo_udp_sdo_src_out_arr,
       src_in_arr                     => aartfaac_bn_sdo_udp_sdo_src_in_arr,