diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc
index 04c23602d594a733a1676213340343607cd7624c..7733d158cf71a1023c724e94c39568e697824045 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc
@@ -191,7 +191,7 @@ void BGL_Processing::checkConsistency(CS1_Parset *parset) const
 #if !defined C_IMPLEMENTATION
   ASSERT(parset->BGLintegrationSteps() % 16		 == 0);
 
-  ASSERT(_FIR_constants_used.input_type			 == INPUT_TYPE);
+  ASSERT(_FIR_constants_used.nr_bits_per_sample		 == NR_BITS_PER_SAMPLE);
   ASSERT(_FIR_constants_used.nr_subband_channels	 == NR_SUBBAND_CHANNELS);
   ASSERT(_FIR_constants_used.nr_taps			 == NR_TAPS);
   ASSERT(_FIR_constants_used.nr_polarizations		 == NR_POLARIZATIONS);
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S
index 20283314686229d825b3e841d29a6c912979ab92..a99f849c02a2ef54db7be7dd9c1af063307c530f 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S
@@ -22,13 +22,15 @@
 #include <CS1_Interface/CS1_Config.h>
 
 #define I4COMPLEX_SIZE	 1
+#define I8COMPLEX_SIZE	 2
 #define I16COMPLEX_SIZE	 4
 #define FCOMPLEX_SIZE	 8
 #define DCOMPLEX_SIZE	16
+#define FLOAT_SIZE	 4
 
 .global	_FIR_constants_used
 _FIR_constants_used:
-	.long	INPUT_TYPE
+	.long	NR_BITS_PER_SAMPLE
 	.long	NR_SUBBAND_CHANNELS
 	.long	NR_TAPS
 	.long	NR_POLARIZATIONS
@@ -208,266 +210,10 @@ L5:
 	blr
 
 
-#if INPUT_TYPE == I4COMPLEX_TYPE
+#if NR_BITS_PER_SAMPLE == 4
 
 .align	5
-fp_table:
-	.long	0x00000000, 0x00000000
-	.long	0x3F800000, 0x00000000
-	.long	0x40000000, 0x00000000
-	.long	0x40400000, 0x00000000
-	.long	0x40800000, 0x00000000
-	.long	0x40A00000, 0x00000000
-	.long	0x40C00000, 0x00000000
-	.long	0x40E00000, 0x00000000
-	.long	0xC1000000, 0x00000000
-	.long	0xC0E00000, 0x00000000
-	.long	0xC0C00000, 0x00000000
-	.long	0xC0A00000, 0x00000000
-	.long	0xC0800000, 0x00000000
-	.long	0xC0400000, 0x00000000
-	.long	0xC0000000, 0x00000000
-	.long	0xBF800000, 0x00000000
-	.long	0x00000000, 0x3F800000
-	.long	0x3F800000, 0x3F800000
-	.long	0x40000000, 0x3F800000
-	.long	0x40400000, 0x3F800000
-	.long	0x40800000, 0x3F800000
-	.long	0x40A00000, 0x3F800000
-	.long	0x40C00000, 0x3F800000
-	.long	0x40E00000, 0x3F800000
-	.long	0xC1000000, 0x3F800000
-	.long	0xC0E00000, 0x3F800000
-	.long	0xC0C00000, 0x3F800000
-	.long	0xC0A00000, 0x3F800000
-	.long	0xC0800000, 0x3F800000
-	.long	0xC0400000, 0x3F800000
-	.long	0xC0000000, 0x3F800000
-	.long	0xBF800000, 0x3F800000
-	.long	0x00000000, 0x40000000
-	.long	0x3F800000, 0x40000000
-	.long	0x40000000, 0x40000000
-	.long	0x40400000, 0x40000000
-	.long	0x40800000, 0x40000000
-	.long	0x40A00000, 0x40000000
-	.long	0x40C00000, 0x40000000
-	.long	0x40E00000, 0x40000000
-	.long	0xC1000000, 0x40000000
-	.long	0xC0E00000, 0x40000000
-	.long	0xC0C00000, 0x40000000
-	.long	0xC0A00000, 0x40000000
-	.long	0xC0800000, 0x40000000
-	.long	0xC0400000, 0x40000000
-	.long	0xC0000000, 0x40000000
-	.long	0xBF800000, 0x40000000
-	.long	0x00000000, 0x40400000
-	.long	0x3F800000, 0x40400000
-	.long	0x40000000, 0x40400000
-	.long	0x40400000, 0x40400000
-	.long	0x40800000, 0x40400000
-	.long	0x40A00000, 0x40400000
-	.long	0x40C00000, 0x40400000
-	.long	0x40E00000, 0x40400000
-	.long	0xC1000000, 0x40400000
-	.long	0xC0E00000, 0x40400000
-	.long	0xC0C00000, 0x40400000
-	.long	0xC0A00000, 0x40400000
-	.long	0xC0800000, 0x40400000
-	.long	0xC0400000, 0x40400000
-	.long	0xC0000000, 0x40400000
-	.long	0xBF800000, 0x40400000
-	.long	0x00000000, 0x40800000
-	.long	0x3F800000, 0x40800000
-	.long	0x40000000, 0x40800000
-	.long	0x40400000, 0x40800000
-	.long	0x40800000, 0x40800000
-	.long	0x40A00000, 0x40800000
-	.long	0x40C00000, 0x40800000
-	.long	0x40E00000, 0x40800000
-	.long	0xC1000000, 0x40800000
-	.long	0xC0E00000, 0x40800000
-	.long	0xC0C00000, 0x40800000
-	.long	0xC0A00000, 0x40800000
-	.long	0xC0800000, 0x40800000
-	.long	0xC0400000, 0x40800000
-	.long	0xC0000000, 0x40800000
-	.long	0xBF800000, 0x40800000
-	.long	0x00000000, 0x40A00000
-	.long	0x3F800000, 0x40A00000
-	.long	0x40000000, 0x40A00000
-	.long	0x40400000, 0x40A00000
-	.long	0x40800000, 0x40A00000
-	.long	0x40A00000, 0x40A00000
-	.long	0x40C00000, 0x40A00000
-	.long	0x40E00000, 0x40A00000
-	.long	0xC1000000, 0x40A00000
-	.long	0xC0E00000, 0x40A00000
-	.long	0xC0C00000, 0x40A00000
-	.long	0xC0A00000, 0x40A00000
-	.long	0xC0800000, 0x40A00000
-	.long	0xC0400000, 0x40A00000
-	.long	0xC0000000, 0x40A00000
-	.long	0xBF800000, 0x40A00000
-	.long	0x00000000, 0x40C00000
-	.long	0x3F800000, 0x40C00000
-	.long	0x40000000, 0x40C00000
-	.long	0x40400000, 0x40C00000
-	.long	0x40800000, 0x40C00000
-	.long	0x40A00000, 0x40C00000
-	.long	0x40C00000, 0x40C00000
-	.long	0x40E00000, 0x40C00000
-	.long	0xC1000000, 0x40C00000
-	.long	0xC0E00000, 0x40C00000
-	.long	0xC0C00000, 0x40C00000
-	.long	0xC0A00000, 0x40C00000
-	.long	0xC0800000, 0x40C00000
-	.long	0xC0400000, 0x40C00000
-	.long	0xC0000000, 0x40C00000
-	.long	0xBF800000, 0x40C00000
-	.long	0x00000000, 0x40E00000
-	.long	0x3F800000, 0x40E00000
-	.long	0x40000000, 0x40E00000
-	.long	0x40400000, 0x40E00000
-	.long	0x40800000, 0x40E00000
-	.long	0x40A00000, 0x40E00000
-	.long	0x40C00000, 0x40E00000
-	.long	0x40E00000, 0x40E00000
-	.long	0xC1000000, 0x40E00000
-	.long	0xC0E00000, 0x40E00000
-	.long	0xC0C00000, 0x40E00000
-	.long	0xC0A00000, 0x40E00000
-	.long	0xC0800000, 0x40E00000
-	.long	0xC0400000, 0x40E00000
-	.long	0xC0000000, 0x40E00000
-	.long	0xBF800000, 0x40E00000
-	.long	0x00000000, 0xC1000000
-	.long	0x3F800000, 0xC1000000
-	.long	0x40000000, 0xC1000000
-	.long	0x40400000, 0xC1000000
-	.long	0x40800000, 0xC1000000
-	.long	0x40A00000, 0xC1000000
-	.long	0x40C00000, 0xC1000000
-	.long	0x40E00000, 0xC1000000
-	.long	0xC1000000, 0xC1000000
-	.long	0xC0E00000, 0xC1000000
-	.long	0xC0C00000, 0xC1000000
-	.long	0xC0A00000, 0xC1000000
-	.long	0xC0800000, 0xC1000000
-	.long	0xC0400000, 0xC1000000
-	.long	0xC0000000, 0xC1000000
-	.long	0xBF800000, 0xC1000000
-	.long	0x00000000, 0xC0E00000
-	.long	0x3F800000, 0xC0E00000
-	.long	0x40000000, 0xC0E00000
-	.long	0x40400000, 0xC0E00000
-	.long	0x40800000, 0xC0E00000
-	.long	0x40A00000, 0xC0E00000
-	.long	0x40C00000, 0xC0E00000
-	.long	0x40E00000, 0xC0E00000
-	.long	0xC1000000, 0xC0E00000
-	.long	0xC0E00000, 0xC0E00000
-	.long	0xC0C00000, 0xC0E00000
-	.long	0xC0A00000, 0xC0E00000
-	.long	0xC0800000, 0xC0E00000
-	.long	0xC0400000, 0xC0E00000
-	.long	0xC0000000, 0xC0E00000
-	.long	0xBF800000, 0xC0E00000
-	.long	0x00000000, 0xC0C00000
-	.long	0x3F800000, 0xC0C00000
-	.long	0x40000000, 0xC0C00000
-	.long	0x40400000, 0xC0C00000
-	.long	0x40800000, 0xC0C00000
-	.long	0x40A00000, 0xC0C00000
-	.long	0x40C00000, 0xC0C00000
-	.long	0x40E00000, 0xC0C00000
-	.long	0xC1000000, 0xC0C00000
-	.long	0xC0E00000, 0xC0C00000
-	.long	0xC0C00000, 0xC0C00000
-	.long	0xC0A00000, 0xC0C00000
-	.long	0xC0800000, 0xC0C00000
-	.long	0xC0400000, 0xC0C00000
-	.long	0xC0000000, 0xC0C00000
-	.long	0xBF800000, 0xC0C00000
-	.long	0x00000000, 0xC0A00000
-	.long	0x3F800000, 0xC0A00000
-	.long	0x40000000, 0xC0A00000
-	.long	0x40400000, 0xC0A00000
-	.long	0x40800000, 0xC0A00000
-	.long	0x40A00000, 0xC0A00000
-	.long	0x40C00000, 0xC0A00000
-	.long	0x40E00000, 0xC0A00000
-	.long	0xC1000000, 0xC0A00000
-	.long	0xC0E00000, 0xC0A00000
-	.long	0xC0C00000, 0xC0A00000
-	.long	0xC0A00000, 0xC0A00000
-	.long	0xC0800000, 0xC0A00000
-	.long	0xC0400000, 0xC0A00000
-	.long	0xC0000000, 0xC0A00000
-	.long	0xBF800000, 0xC0A00000
-	.long	0x00000000, 0xC0800000
-	.long	0x3F800000, 0xC0800000
-	.long	0x40000000, 0xC0800000
-	.long	0x40400000, 0xC0800000
-	.long	0x40800000, 0xC0800000
-	.long	0x40A00000, 0xC0800000
-	.long	0x40C00000, 0xC0800000
-	.long	0x40E00000, 0xC0800000
-	.long	0xC1000000, 0xC0800000
-	.long	0xC0E00000, 0xC0800000
-	.long	0xC0C00000, 0xC0800000
-	.long	0xC0A00000, 0xC0800000
-	.long	0xC0800000, 0xC0800000
-	.long	0xC0400000, 0xC0800000
-	.long	0xC0000000, 0xC0800000
-	.long	0xBF800000, 0xC0800000
-	.long	0x00000000, 0xC0400000
-	.long	0x3F800000, 0xC0400000
-	.long	0x40000000, 0xC0400000
-	.long	0x40400000, 0xC0400000
-	.long	0x40800000, 0xC0400000
-	.long	0x40A00000, 0xC0400000
-	.long	0x40C00000, 0xC0400000
-	.long	0x40E00000, 0xC0400000
-	.long	0xC1000000, 0xC0400000
-	.long	0xC0E00000, 0xC0400000
-	.long	0xC0C00000, 0xC0400000
-	.long	0xC0A00000, 0xC0400000
-	.long	0xC0800000, 0xC0400000
-	.long	0xC0400000, 0xC0400000
-	.long	0xC0000000, 0xC0400000
-	.long	0xBF800000, 0xC0400000
-	.long	0x00000000, 0xC0000000
-	.long	0x3F800000, 0xC0000000
-	.long	0x40000000, 0xC0000000
-	.long	0x40400000, 0xC0000000
-	.long	0x40800000, 0xC0000000
-	.long	0x40A00000, 0xC0000000
-	.long	0x40C00000, 0xC0000000
-	.long	0x40E00000, 0xC0000000
-	.long	0xC1000000, 0xC0000000
-	.long	0xC0E00000, 0xC0000000
-	.long	0xC0C00000, 0xC0000000
-	.long	0xC0A00000, 0xC0000000
-	.long	0xC0800000, 0xC0000000
-	.long	0xC0400000, 0xC0000000
-	.long	0xC0000000, 0xC0000000
-	.long	0xBF800000, 0xC0000000
-	.long	0x00000000, 0xBF800000
-	.long	0x3F800000, 0xBF800000
-	.long	0x40000000, 0xBF800000
-	.long	0x40400000, 0xBF800000
-	.long	0x40800000, 0xBF800000
-	.long	0x40A00000, 0xBF800000
-	.long	0x40C00000, 0xBF800000
-	.long	0x40E00000, 0xBF800000
-	.long	0xC1000000, 0xBF800000
-	.long	0xC0E00000, 0xBF800000
-	.long	0xC0C00000, 0xBF800000
-	.long	0xC0A00000, 0xBF800000
-	.long	0xC0800000, 0xBF800000
-	.long	0xC0400000, 0xBF800000
-	.long	0xC0000000, 0xBF800000
-	.long	0xBF800000, 0xBF800000
+.comm	_FIR_fp_table,16*16*FCOMPLEX_SIZE,8
 
 .global	_filter
 _filter:
@@ -484,7 +230,7 @@ _filter:
 #	internally used:
 #	r9 :	  8
 #	r10 :	  512
-#	r11 :	  pointer to fp_table
+#	r11 :	  pointer to _FIR_fp_table
 #	r28-r31 : prefetched samples
 #	f0-f15 :  delay line (real in primary, imaginary in secondary unit)
 #	f16-f23 : weights (these are real values alternately stored in primary
@@ -523,8 +269,8 @@ _filter:
 	# convert 15 i4complex numbers to fcomplex
 	li	10,I4COMPLEX_SIZE*NR_SUBBAND_CHANNELS*NR_POLARIZATIONS
 				; lbzx	 29,0,5	# fetch FIR history samples
-	lis     11,fp_table@ha	; lbzux  30,5,10
-	la      11,fp_table@l(11);lbzux	 31,5,10
+	lis     11,_FIR_fp_table@ha	; lbzux  30,5,10
+	la      11,_FIR_fp_table@l(11);lbzux	 31,5,10
 	slwi	29,29,3		; lbzux	 28,5,10
 				; lfpsx  1,11,29
 	slwi	30,30,3		; lbzux	 29,5,10
@@ -571,19 +317,54 @@ _filter:
 loop:
 	# time steps 0-5
 
-	fxcpmadd 24,17,14,24
-	fxcsmadd 25,17,14,25
-	fxcpmadd 26,18,14,26
-	fxcsmadd 27,18,14,27
-	fxcpmadd 28,19,14,28
-	fxcsmadd 29,19,14,29
+	fxcsmadd 24,23,1,24	; slwi	 28,28,3
+	fxcpmadd 25,20,9,25	; lfpsx	 0,11,28
+	fxcsmadd 26,20,9,26
+	fxcpmadd 27,21,9,27
+	fxcsmadd 28,21,9,28
+	fxcpmadd 29,22,9,29	; lbzux	 28,5,10
+
+	fxcpmadd 24,23,2,24
+	fxcsmadd 25,23,2,25
+	fxcpmadd 26,20,10,26
+	fxcsmadd 27,20,10,27
+	fxcpmadd 28,21,10,28
+	fxcsmadd 29,21,10,29
+
+	fxcsmadd 24,22,3,24	; slwi	 29,29,3
+	fxcpmadd 25,23,3,25	; lfpsx	 1,11,29
+	fxcsmadd 26,23,3,26
+	fxcpmadd 27,20,11,27
+	fxcsmadd 28,20,11,28
+	fxcpmadd 29,21,11,29	; lbzux	 29,5,10
+
+	fxcpmadd 24,22,4,24
+	fxcsmadd 25,22,4,25
+	fxcpmadd 26,23,4,26
+	fxcsmadd 27,23,4,27
+	fxcpmadd 28,20,12,28
+	fxcsmadd 29,20,12,29
 
-	fxcpmadd 24,21,6,24	; slwi	 28,28,3
-	fxcsmadd 25,21,6,25	; lfpsx	 0,11,28
-	fxcpmadd 26,22,6,26	; lbzux	 28,5,10
+	fxcsmadd 24,21,5,24	; slwi	 30,30,3
+	fxcpmadd 25,22,5,25	; lfpsx	 2,11,30
+	fxcsmadd 26,22,5,26
+	fxcpmadd 27,23,5,27
+	fxcsmadd 28,23,5,28
+	fxcpmadd 29,20,13,29	; lbzux	 30,5,10
+
+	fxcpmadd 24,21,6,24
+	fxcsmadd 25,21,6,25
+	fxcpmadd 26,22,6,26
 	fxcsmadd 27,22,6,27
 	fxcpmadd 28,23,6,28
 	fxcsmadd 29,23,6,29
+	
+	fxcsmadd 24,20,7,24	; slwi	 31,31,3
+	fxcpmadd 25,21,7,25	; lfpsx	 3,11,31
+	fxcsmadd 26,21,7,26
+	fxcpmadd 27,22,7,27
+	fxcsmadd 28,22,7,28
+	fxcpmadd 29,23,7,29	; lbzux	 31,5,10
 
 	fxcpmadd 24,20,8,24
 	fxcsmadd 25,20,8,25
@@ -592,40 +373,26 @@ loop:
 	fxcpmadd 28,22,8,28
 	fxcsmadd 29,22,8,29
 
+	fxcpmadd 24,17,14,24	; slwi	 28,28,3
+	fxcsmadd 25,17,14,25	; lfpsx	 4,11,28
+	fxcpmadd 26,18,14,26
+	fxcsmadd 27,18,14,27
+	fxcpmadd 28,19,14,28
+	fxcsmadd 29,19,14,29	; lbzux	 28,5,10
+	
 	fxcpmadd 24,16,0,24
 	fxcsmadd 25,16,0,25
 	fxcpmadd 26,17,0,26
 	fxcsmadd 27,17,0,27
 	fxcpmadd 28,18,0,28
-	fxcsmadd 29,18,0,29	; slwi	 29,29,3
-
-	fxcsmadd 24,23,1,24	; lfpsx	 1,11,29
-	fxcpmadd 25,20,9,25	; lbzux	 29,5,10
-	fxcsmadd 26,20,9,26
-	fxcpmadd 27,21,9,27
-	fxcsmadd 28,21,9,28
-	fxcpmadd 29,22,9,29
+	fxcsmadd 29,18,0,29
 
-	fxcsmadd 24,19,9,24
-	fxcpmadd 25,16,1,25
+	fxcsmadd 24,19,9,24	; slwi	 29,29,3
+	fxcpmadd 25,16,1,25	; lfpsx	 5,11,29
 	fxcsmadd 26,16,1,26
 	fxcpmadd 27,17,1,27
 	fxcsmadd 28,17,1,28
-	fxcpmadd 29,18,1,29
-
-	fxcpmadd 24,23,2,24	; slwi	 30,30,3
-	fxcsmadd 25,23,2,25	; lfpsx	 2,11,30
-	fxcpmadd 26,20,10,26	; lbzux	 30,5,10
-	fxcsmadd 27,20,10,27
-	fxcpmadd 28,21,10,28
-	fxcsmadd 29,21,10,29
-	
-	fxcsmadd 24,20,7,24
-	fxcpmadd 25,21,7,25
-	fxcsmadd 26,21,7,26
-	fxcpmadd 27,22,7,27
-	fxcsmadd 28,22,7,28
-	fxcpmadd 29,23,7,29
+	fxcpmadd 29,18,1,29	; lbzux	 29,5,10
 
 	fxcpmadd 24,19,10,24
 	fxcsmadd 25,19,10,25
@@ -634,13 +401,6 @@ loop:
 	fxcpmadd 28,17,2,28
 	fxcsmadd 29,17,2,29
 
-	fxcsmadd 24,22,3,24
-	fxcpmadd 25,23,3,25	; slwi	 31,31,3
-	fxcsmadd 26,23,3,26	; lfpsx	 3,11,31
-	fxcpmadd 27,20,11,27	; lbzux	 31,5,10
-	fxcsmadd 28,20,11,28
-	fxcpmadd 29,21,11,29
-
 	fxcsmadd 24,18,11,24
 	fxcpmadd 25,19,11,25
 	fxcsmadd 26,19,11,26
@@ -648,13 +408,6 @@ loop:
 	fxcsmadd 28,16,3,28
 	fxcpmadd 29,17,3,29
 
-	fxcpmadd 24,22,4,24
-	fxcsmadd 25,22,4,25
-	fxcpmadd 26,23,4,26	; slwi	 28,28,3
-	fxcsmadd 27,23,4,27	; lfpsx	 4,11,28
-	fxcpmadd 28,20,12,28	; lbzux	 28,5,10
-	fxcsmadd 29,20,12,29
-
 	fxcpmadd 24,18,12,24
 	fxcsmadd 25,18,12,25
 	fxcpmadd 26,19,12,26
@@ -662,13 +415,6 @@ loop:
 	fxcpmadd 28,16,4,28
 	fxcsmadd 29,16,4,29
 
-	fxcsmadd 24,21,5,24
-	fxcpmadd 25,22,5,25
-	fxcsmadd 26,22,5,26
-	fxcpmadd 27,23,5,27	; slwi	 29,29,3
-	fxcsmadd 28,23,5,28	; lfpsx	 5,11,29
-	fxcpmadd 29,20,13,29	; lbzux	 29,5,10
-
 	fxcsmadd 24,17,13,24
 	fxcpmadd 25,18,13,25
 	fxcsmadd 26,18,13,26
@@ -802,112 +548,1200 @@ loop:
 	fxcpmadd 27,18,10,27
 	fxcsmadd 28,18,10,28
 
-	fxcpmadd 24,20,3,24
-	fxcsmadd 25,20,3,25
-	fxcpmadd 26,21,3,26
-	fxcsmadd 27,21,3,27
-	fxcpmadd 28,22,3,28
-	
-	fxcpmadd 24,16,11,24
-	fxcsmadd 25,16,11,25
-	fxcpmadd 26,17,11,26
-	fxcsmadd 27,17,11,27
-	fxcpmadd 28,18,11,28	; slwi	 28,28,3
+	fxcpmadd 24,20,3,24
+	fxcsmadd 25,20,3,25
+	fxcpmadd 26,21,3,26
+	fxcsmadd 27,21,3,27
+	fxcpmadd 28,22,3,28
+	
+	fxcpmadd 24,16,11,24
+	fxcsmadd 25,16,11,25
+	fxcpmadd 26,17,11,26
+	fxcsmadd 27,17,11,27
+	fxcpmadd 28,18,11,28	; slwi	 28,28,3
+
+	fxcsmadd 24,23,12,24	; lfpsx	 12,11,28
+	fxcpmadd 25,20,4,25	; lbzux	 28,5,10
+	fxcsmadd 26,20,4,26
+	fxcpmadd 27,21,4,27
+	fxcsmadd 28,21,4,28
+
+	fxcsmadd 24,19,4,24
+	fxcpmadd 25,16,12,25
+	fxcsmadd 26,16,12,26
+	fxcpmadd 27,17,12,27
+	fxcsmadd 28,17,12,28
+
+	fxcpmadd 24,23,13,24	; slwi	 29,29,3
+	fxcsmadd 25,23,13,25	; lfpsx	 13,11,29
+	fxcpmadd 26,20,5,26	; lbzux	 29,5,10
+	fxcsmadd 27,20,5,27
+	fxcpmadd 28,21,5,28
+	
+	fxcsmadd 24,17,8,24
+	fxcpmadd 25,18,8,25
+	fxcsmadd 26,18,8,26
+	fxcpmadd 27,19,8,27
+	fxcsmadd 28,19,8,28
+
+	fxcpmadd 24,19,5,24
+	fxcsmadd 25,19,5,25
+	fxcpmadd 26,16,13,26
+	fxcsmadd 27,16,13,27
+	fxcpmadd 28,17,13,28
+
+	fxcsmadd 24,22,14,24
+	fxcpmadd 25,23,14,25	; slwi	 30,30,3
+	fxcsmadd 26,23,14,26	; lfpsx	 14,11,30
+	fxcpmadd 27,20,6,27	; lbzux	 30,5,10
+	fxcsmadd 28,20,6,28
+	
+	fxcsmadd 24,18,6,24
+	fxcpmadd 25,19,6,25
+	fxcsmadd 26,19,6,26
+	fxcpmadd 27,16,14,27
+	fxcsmadd 28,16,14,28
+
+	fxcpmadd 24,21,1,24
+	fxcsmadd 25,21,1,25
+	fxcpmadd 26,22,1,26
+	fxcsmadd 27,22,1,27
+	fxcpmadd 28,23,1,28
+
+	fxcpmadd 24,22,15,24
+	fxcsmadd 25,22,15,25
+	fxcpmadd 26,23,15,26	; slwi	 31,31,3
+	fxcsmadd 27,23,15,27	; lfpsx	 15,11,31
+	fxcpmadd 28,20,7,28	; lbzux	 31,5,10
+
+	fxcpmadd 24,18,7,24
+	fxcsmadd 25,18,7,25
+	fxcpmadd 26,19,7,26
+	fxcsmadd 27,19,7,27
+	fxcpmadd 28,16,15,28
+
+	fxpmul	 29,19,15	; stfpsux 24,6,9
+	fxsmul	 24,16,15	; stfpsux 25,6,9
+	fxpmul	 25,17,15	; stfpsux 26,6,9
+	fxsmul	 26,17,15	; stfpsux 27,6,9
+	fxpmul	 27,18,15	; stfpsux 28,6,9
+	fxsmul	 28,18,15
+
+	bdnz	loop
+
+	la	8,16(1)	# restore call-saved registers
+	li	9,DCOMPLEX_SIZE
+
+	lfpdx	14,0,8
+	lfpdux	15,8,9
+	lfpdux	16,8,9
+	lfpdux	17,8,9
+	lfpdux	18,8,9
+	lfpdux	19,8,9
+	lfpdux	20,8,9
+	lfpdux	21,8,9
+	lfpdux	22,8,9
+	lfpdux	23,8,9
+	lfpdux	24,8,9
+	lfpdux	25,8,9
+	lfpdux	26,8,9
+	lfpdux	27,8,9
+	lfpdux	28,8,9
+	lfpdux	29,8,9
+	lmw	28,0(1)		# restore r28 ... r31
+
+	addi	1,1,272		# restore stack pointer
+	blr			# return
+
+#elif NR_BITS_PER_SAMPLE == 8
+
+.align	5
+.comm	_FIR_fp_table,256*256*FCOMPLEX_SIZE,8
+
+.global	_filter
+_filter:
+#	filters all samples for one station, one polarization
+
+#	arguments:
+#	r3 :	  pointer to delay line (fcomplex[16])  NOTE: USE OF THE DELAY
+#		  LINE IS COMMENTED OUT!!!!!!!!!!!!!!!!!!!!
+#	r4 :	  pointer to weights line (const fcomplex[16])
+#	r5 :	  pointer to first sample (const i8complex[16*r7])
+#	r6 :	  pointer to result (fcomplex *)
+#	r7 :	  number of samples / 16
+
+#	internally used:
+#	r9 :	  8
+#	r10 :	  1024
+#	r11 :	  ptr to constant table
+#	r28-r31 : prefetched samples
+#	f0-f15 :  delay line (real in primary, imaginary in secondary unit)
+#	f16-f23 : weights (these are real values alternately stored in primary
+#			   and secondary units)
+#	f24-f31 : sums
+
+#	The implementation works on 8 time samples concurrently, to avoid
+#	stalls in the double hummer.  This unfortunately leads to totally
+#	incomprehensible code.  The loop processes 16 samples at a time.
+#	The input is converted from i8complex to fcomplex by a lookup in
+#	_FIR_fp_table, making the code even harder to understand.
+
+	mtctr	7		# set number of iterations
+
+	li	9,-DCOMPLEX_SIZE
+	stfpdux 14,1,9		# save call-saved registers
+	stfpdux 15,1,9
+	stfpdux 16,1,9
+	stfpdux 17,1,9
+	stfpdux 18,1,9
+	stfpdux 19,1,9
+	stfpdux 20,1,9
+	stfpdux 21,1,9
+	stfpdux 22,1,9
+	stfpdux 23,1,9
+	stfpdux 24,1,9
+	stfpdux 25,1,9
+	stfpdux 26,1,9
+	stfpdux 27,1,9
+	stfpdux 28,1,9
+	stfpdux 29,1,9
+	stfpdux 30,1,9
+	stfpdux 31,1,9
+
+	subi	1,1,16
+	stmw	28,0(1)	# save r28 ... r31
+
+	lis	11,_FIR_fp_table@ha
+	li	9,FCOMPLEX_SIZE
+	la	11,_FIR_fp_table@l(11)
+	li	10,I8COMPLEX_SIZE*NR_SUBBAND_CHANNELS*NR_POLARIZATIONS
+	subi	6,6,FCOMPLEX_SIZE
+
+
+#if 0
+	lfpsx	1,0,3		# load delay line
+	lfpsux	2,3,9
+	lfpsux	3,3,9
+	lfpsux	4,3,9
+	lfpsux	5,3,9
+	lfpsux	6,3,9
+	lfpsux	7,3,9
+	lfpsux	8,3,9
+	lfpsux	9,3,9
+	lfpsux	10,3,9
+	lfpsux	11,3,9
+	lfpsux	12,3,9
+	lfpsux	13,3,9
+	lfpsux	14,3,9
+	lfpsux	15,3,9
+#endif
+
+	# convert 15 i8complex numbers to fcomplex
+	lhzx	29,0,5		# fetch FIR history samples
+	lhzux	30,5,10
+	lhzux	31,5,10
+	lhzux	28,5,10
+
+	rlwinm	12,29,3,13,28
+	lfpsx	1,11,12
+	lhzux	29,5,10
+
+	rlwinm	12,30,3,13,28
+	lfpsx	2,11,12
+	lhzux	30,5,10
+
+	rlwinm	12,31,3,13,28
+	lfpsx	3,11,12
+	lhzux	31,5,10
+
+	rlwinm	12,28,3,13,28
+	lfpsx	4,11,12
+	lhzux	28,5,10
+
+	rlwinm	12,29,3,13,28
+	lfpsx	5,11,12
+	lhzux	29,5,10
+
+	rlwinm	12,30,3,13,28
+	lfpsx	6,11,12
+	lhzux	30,5,10
+
+	rlwinm	12,31,3,13,28
+	lfpsx	7,11,12
+	lhzux	31,5,10
+
+	rlwinm	12,28,3,13,28
+	lfpsx	8,11,12
+	lhzux	28,5,10
+
+	rlwinm	12,29,3,13,28
+	lfpsx	9,11,12
+	lhzux	29,5,10
+
+	rlwinm	12,30,3,13,28
+	lfpsx	10,11,12
+	lhzux	30,5,10
+
+	rlwinm	12,31,3,13,28
+	lfpsx	11,11,12
+	lhzux	31,5,10
+
+	rlwinm	12,28,3,13,28
+	lfpsx	12,11,12
+	lhzux	28,5,10 # prefetch samples before entering loop
+
+	rlwinm	12,29,3,13,28
+	lfpsx	13,11,12
+	lhzux	29,5,10
+
+	rlwinm	12,30,3,13,28
+	lfpsx	14,11,12
+	lhzux	30,5,10
+
+	rlwinm	12,31,3,13,28
+	lfpsx	15,11,12
+	lhzux	31,5,10
+
+				; lfpsx	  16,0,4	# load weights line
+				; lfpsux  17,4,9
+				; lfpsux  18,4,9
+				; lfpsux  19,4,9
+				; lfpsux  20,4,9
+				; lfpsux  21,4,9
+				; lfpsux  22,4,9
+				; lfpsux  23,4,9
+
+	# essentially part of loop, but interleaved with
+	# the stores in the tail of the loop
+	fxpmul	 24,20,8
+	fxsmul	 25,20,8
+	fxpmul	 26,21,8
+	fxsmul	 27,21,8
+	fxpmul	 28,22,8
+	fxsmul	 29,22,8
+	fxpmul	 30,23,8
+	fxsmul	 31,23,8
+
+loop:
+	# time steps 0-7
+
+	# load 0
+
+	fxcsmadd 24,23,1,24	; rlwinm 12,28,3,13,28
+	# load 1
+	fxcpmadd 25,20,9,25	; lfpsx  0,11,12
+	fxcsmadd 26,20,9,26	; lhzux	 28,5,10
+	fxcpmadd 27,21,9,27
+	fxcsmadd 28,21,9,28
+	fxcpmadd 29,22,9,29
+	fxcsmadd 30,22,9,30
+	fxcpmadd 31,23,9,31
+
+	fxcpmadd 24,23,2,24	; rlwinm 12,29,3,13,28
+	fxcsmadd 25,23,2,25	; lfpsx  1,11,12
+	# load 2
+	fxcpmadd 26,20,10,26	; lhzux	 29,5,10
+	fxcsmadd 27,20,10,27
+	fxcpmadd 28,21,10,28
+	fxcsmadd 29,21,10,29
+	fxcpmadd 30,22,10,30
+	fxcsmadd 31,22,10,31
+
+	fxcsmadd 24,22,3,24	; rlwinm 12,30,3,13,28
+	fxcpmadd 25,23,3,25	; lfpsx  2,11,12
+	fxcsmadd 26,23,3,26	; lhzux	 30,5,10
+	# load 3
+	fxcpmadd 27,20,11,27
+	fxcsmadd 28,20,11,28
+	fxcpmadd 29,21,11,29
+	fxcsmadd 30,21,11,30
+	fxcpmadd 31,22,11,31
+
+	fxcpmadd 24,22,4,24	; rlwinm 12,31,3,13,28
+	fxcsmadd 25,22,4,25	; lfpsx  3,11,12
+	fxcpmadd 26,23,4,26	; lhzux	 31,5,10
+	fxcsmadd 27,23,4,27
+	# load 4
+	fxcpmadd 28,20,12,28
+	fxcsmadd 29,20,12,29
+	fxcpmadd 30,21,12,30
+	fxcsmadd 31,21,12,31
+
+	fxcsmadd 24,21,5,24
+	fxcpmadd 25,22,5,25
+	fxcsmadd 26,22,5,26
+	fxcpmadd 27,23,5,27
+	fxcsmadd 28,23,5,28
+	# load 5
+	fxcpmadd 29,20,13,29
+	fxcsmadd 30,20,13,30
+	fxcpmadd 31,21,13,31
+
+	fxcpmadd 24,21,6,24
+	fxcsmadd 25,21,6,25
+	fxcpmadd 26,22,6,26
+	fxcsmadd 27,22,6,27
+	fxcpmadd 28,23,6,28
+	fxcsmadd 29,23,6,29
+	# load 6
+	fxcpmadd 30,20,14,30
+	fxcsmadd 31,20,14,31
+
+	fxcsmadd 24,20,7,24
+	fxcpmadd 25,21,7,25
+	fxcsmadd 26,21,7,26
+	fxcpmadd 27,22,7,27	; rlwinm 12,28,3,13,28
+	fxcsmadd 28,22,7,28	; lfpsx	 4,11,12
+	fxcpmadd 29,23,7,29	; lhzux	 28,5,10
+	fxcsmadd 30,23,7,30
+	# load 7
+	fxcpmadd 31,20,15,31
+
+	# after loads
+	fxcpmadd 24,16,0,24
+	fxcsmadd 25,16,0,25
+	fxcpmadd 26,17,0,26
+	fxcsmadd 27,17,0,27
+	fxcpmadd 28,18,0,28
+	fxcsmadd 29,18,0,29
+	fxcpmadd 30,19,0,30
+	fxcsmadd 31,19,0,31	; rlwinm 12,29,3,13,28
+
+	fxcsmadd 24,19,9,24	; lfpsx  5,11,12
+	fxcpmadd 25,16,1,25	; lhzux	 29,5,10
+	fxcsmadd 26,16,1,26
+	fxcpmadd 27,17,1,27
+	fxcsmadd 28,17,1,28
+	fxcpmadd 29,18,1,29
+	fxcsmadd 30,18,1,30
+	fxcpmadd 31,19,1,31
+
+	fxcpmadd 24,19,10,24
+	fxcsmadd 25,19,10,25
+	fxcpmadd 26,16,2,26
+	fxcsmadd 27,16,2,27	; rlwinm 12,30,3,13,28
+	fxcpmadd 28,17,2,28	; lfpsx  6,11,12
+	fxcsmadd 29,17,2,29	; lhzux	 30,5,10
+	fxcpmadd 30,18,2,30
+	fxcsmadd 31,18,2,31
+
+	fxcsmadd 24,18,11,24
+	fxcpmadd 25,19,11,25
+	fxcsmadd 26,19,11,26
+	fxcpmadd 27,16,3,27
+	fxcsmadd 28,16,3,28
+	fxcpmadd 29,17,3,29
+	fxcsmadd 30,17,3,30
+	fxcpmadd 31,18,3,31	; rlwinm 12,31,3,13,28
+	
+	fxcpmadd 24,18,12,24	; lfpsx  7,11,12
+	fxcsmadd 25,18,12,25	; lhzux	 31,5,10
+	fxcpmadd 26,19,12,26
+	fxcsmadd 27,19,12,27
+	fxcpmadd 28,16,4,28
+	fxcsmadd 29,16,4,29
+	fxcpmadd 30,17,4,30
+	fxcsmadd 31,17,4,31
+
+	fxcsmadd 24,17,13,24
+	fxcpmadd 25,18,13,25
+	fxcsmadd 26,18,13,26
+	fxcpmadd 27,19,13,27
+	fxcsmadd 28,19,13,28
+	fxcpmadd 29,16,5,29
+	fxcsmadd 30,16,5,30
+	fxcpmadd 31,17,5,31	; rlwinm 12,28,3,13,28
+
+	fxcpmadd 24,17,14,24	; lfpsx  8,11,12
+	fxcsmadd 25,17,14,25	; lhzux	 28,5,10
+	fxcpmadd 26,18,14,26
+	fxcsmadd 27,18,14,27
+	fxcpmadd 28,19,14,28
+	fxcsmadd 29,19,14,29
+	fxcpmadd 30,16,6,30
+	fxcsmadd 31,16,6,31
+
+	fxcsmadd 24,16,15,24
+	fxcpmadd 25,17,15,25
+	fxcsmadd 26,17,15,26
+	fxcpmadd 27,18,15,27
+	fxcsmadd 28,18,15,28
+	fxcpmadd 29,19,15,29	; stfpsux 24,6,9
+	fxcsmadd 30,19,15,30	; stfpsux 25,6,9
+	fxcpmadd 31,16,7,31	; stfpsux 26,6,9
+
+	
+	# time steps 8-15
+	fxpmul	 24,20,0
+	fxsmul	 25,20,0
+	fxpmul	 26,21,0	; stfpsux 27,6,9
+	fxsmul	 27,21,0	; stfpsux 28,6,9
+	fxpmul	 28,22,0	; stfpsux 29,6,9
+	fxsmul	 29,22,0	; stfpsux 30,6,9
+	fxpmul	 30,23,0	; stfpsux 31,6,9
+	fxsmul	 31,23,0
+
+	# already loaded 8
+
+	fxcsmadd 24,23,9,24
+	# load 9
+	fxcpmadd 25,20,1,25
+	fxcsmadd 26,20,1,26
+	fxcpmadd 27,21,1,27
+	fxcsmadd 28,21,1,28
+	fxcpmadd 29,22,1,29
+	fxcsmadd 30,22,1,30
+	fxcpmadd 31,23,1,31
+
+	fxcpmadd 24,23,10,24
+	fxcsmadd 25,23,10,25
+	# load 10
+	fxcpmadd 26,20,2,26
+	fxcsmadd 27,20,2,27
+	fxcpmadd 28,21,2,28
+	fxcsmadd 29,21,2,29
+	fxcpmadd 30,22,2,30
+	fxcsmadd 31,22,2,31	; rlwinm 12,29,3,13,28
+
+	fxcsmadd 24,22,11,24	; lfpsx  9,11,12
+	fxcpmadd 25,23,11,25	; lhzux	 29,5,10
+	fxcsmadd 26,23,11,26
+	# load 11
+	fxcpmadd 27,20,3,27
+	fxcsmadd 28,20,3,28
+	fxcpmadd 29,21,3,29
+	fxcsmadd 30,21,3,30
+	fxcpmadd 31,22,3,31
+
+	fxcpmadd 24,22,12,24
+	fxcsmadd 25,22,12,25
+	fxcpmadd 26,23,12,26
+	fxcsmadd 27,23,12,27	; rlwinm 12,30,3,13,28
+	# load 12
+	fxcpmadd 28,20,4,28	; lfpsx  10,11,12
+	fxcsmadd 29,20,4,29	; lhzux	 30,5,10
+	fxcpmadd 30,21,4,30
+	fxcsmadd 31,21,4,31
+
+	fxcsmadd 24,21,13,24
+	fxcpmadd 25,22,13,25
+	fxcsmadd 26,22,13,26
+	fxcpmadd 27,23,13,27
+	fxcsmadd 28,23,13,28
+	# load 13
+	fxcpmadd 29,20,5,29
+	fxcsmadd 30,20,5,30
+	fxcpmadd 31,21,5,31	; rlwinm 12,31,3,13,28
+
+	fxcpmadd 24,21,14,24	; lfpsx  11,11,12
+	fxcsmadd 25,21,14,25	; lhzux	 31,5,10
+	fxcpmadd 26,22,14,26
+	fxcsmadd 27,22,14,27
+	fxcpmadd 28,23,14,28
+	fxcsmadd 29,23,14,29
+	# load 14
+	fxcpmadd 30,20,6,30
+	fxcsmadd 31,20,6,31
+
+	fxcsmadd 24,20,15,24
+	fxcpmadd 25,21,15,25
+	fxcsmadd 26,21,15,26
+	fxcpmadd 27,22,15,27	; rlwinm 12,28,3,13,28
+	fxcsmadd 28,22,15,28	; lfpsx  12,11,12
+	fxcpmadd 29,23,15,29	; lhzux	 28,5,10
+	fxcsmadd 30,23,15,30
+	# load 15
+	fxcpmadd 31,20,7,31
+
+	# after loads
+
+	fxcpmadd 24,16,8,24
+	fxcsmadd 25,16,8,25
+	fxcpmadd 26,17,8,26
+	fxcsmadd 27,17,8,27
+	fxcpmadd 28,18,8,28
+	fxcsmadd 29,18,8,29
+	fxcpmadd 30,19,8,30
+	fxcsmadd 31,19,8,31	; rlwinm 12,29,3,13,28
+
+	fxcsmadd 24,19,1,24	; lfpsx  13,11,12
+	fxcpmadd 25,16,9,25	; lhzux	 29,5,10
+	fxcsmadd 26,16,9,26
+	fxcpmadd 27,17,9,27
+	fxcsmadd 28,17,9,28
+	fxcpmadd 29,18,9,29
+	fxcsmadd 30,18,9,30
+	fxcpmadd 31,19,9,31
+
+	fxcpmadd 24,19,2,24
+	fxcsmadd 25,19,2,25
+	fxcpmadd 26,16,10,26
+	fxcsmadd 27,16,10,27	; rlwinm 12,30,3,13,28
+	fxcpmadd 28,17,10,28	; lfpsx  14,11,12
+	fxcsmadd 29,17,10,29	; lhzux	 30,5,10
+	fxcpmadd 30,18,10,30
+	fxcsmadd 31,18,10,31
+
+	fxcsmadd 24,18,3,24
+	fxcpmadd 25,19,3,25
+	fxcsmadd 26,19,3,26
+	fxcpmadd 27,16,11,27
+	fxcsmadd 28,16,11,28
+	fxcpmadd 29,17,11,29
+	fxcsmadd 30,17,11,30
+	fxcpmadd 31,18,11,31	; rlwinm 12,31,3,13,28
+
+	fxcpmadd 24,18,4,24	; lfpsx  15,11,12
+	fxcsmadd 25,18,4,25	; lhzux	 31,5,10
+	fxcpmadd 26,19,4,26
+	fxcsmadd 27,19,4,27
+	fxcpmadd 28,16,12,28
+	fxcsmadd 29,16,12,29
+	fxcpmadd 30,17,12,30
+	fxcsmadd 31,17,12,31
+	
+	fxcsmadd 24,17,5,24
+	fxcpmadd 25,18,5,25
+	fxcsmadd 26,18,5,26
+	fxcpmadd 27,19,5,27
+	fxcsmadd 28,19,5,28
+	fxcpmadd 29,16,13,29
+	fxcsmadd 30,16,13,30
+	fxcpmadd 31,17,13,31
+
+	fxcpmadd 24,17,6,24
+	fxcsmadd 25,17,6,25
+	fxcpmadd 26,18,6,26
+	fxcsmadd 27,18,6,27
+	fxcpmadd 28,19,6,28
+	fxcsmadd 29,19,6,29
+	fxcpmadd 30,16,14,30
+	fxcsmadd 31,16,14,31
+
+	fxcsmadd 24,16,7,24
+	fxcpmadd 25,17,7,25
+	fxcsmadd 26,17,7,26
+	fxcpmadd 27,18,7,27
+	fxcsmadd 28,18,7,28
+	fxcpmadd 29,19,7,29	; stfpsux 24,6,9
+	fxcsmadd 30,19,7,30	; stfpsux 25,6,9
+	fxcpmadd 31,16,15,31	; stfpsux 26,6,9
+
+	fxpmul	 24,20,8	# part of next loop
+	fxsmul	 25,20,8
+	fxpmul	 26,21,8	; stfpsux 27,6,9
+	fxsmul	 27,21,8	; stfpsux 28,6,9
+	fxpmul	 28,22,8	; stfpsux 29,6,9
+	fxsmul	 29,22,8	; stfpsux 30,6,9
+	fxpmul	 30,23,8	; stfpsux 31,6,9
+	fxsmul	 31,23,8
+	bdnz	loop
+
+#if 0
+	addi	3,3,-120	# store delay line
+	stfpsux	1,3,9
+	stfpsux	2,3,9
+	stfpsux	3,3,9
+	stfpsux	4,3,9
+	stfpsux	5,3,9
+	stfpsux	6,3,9
+	stfpsux	7,3,9
+	stfpsux	8,3,9
+	stfpsux	9,3,9
+	stfpsux	10,3,9
+	stfpsux	11,3,9
+	stfpsux	12,3,9
+	stfpsux	13,3,9
+	stfpsux	14,3,9
+	stfpsux	15,3,9
+#endif
+
+	li	9,DCOMPLEX_SIZE	# restore call-saved registers
+	lmw	28,0(1)	# restore r28 ... r31
+
+	#addi	1,1,16
+	lfpdux	31,1,9
+	lfpdux	30,1,9
+	lfpdux	29,1,9
+	lfpdux	28,1,9
+	lfpdux	27,1,9
+	lfpdux	26,1,9
+	lfpdux	25,1,9
+	lfpdux	24,1,9
+	lfpdux	23,1,9
+	lfpdux	22,1,9
+	lfpdux	21,1,9
+	lfpdux	20,1,9
+	lfpdux	19,1,9
+	lfpdux	18,1,9
+	lfpdux	17,1,9
+	lfpdux	16,1,9
+	lfpdux	15,1,9
+	lfpdux	14,1,9
+
+	addi	1,1,16		# restore stack pointer
+	blr			# return
+
+#elif 0 && NR_BITS_PER_SAMPLE == 16
+
+.align	5
+.comm	_FIR_fp_table,65536*FLOAT_SIZE,4
+
+.global	_filter
+_filter:
+#	filters all samples for one station, one polarization
+
+#	arguments:
+#	r3 :	  pointer to delay line (fcomplex[16])  NOTE: USE OF THE DELAY
+#		  LINE IS COMMENTED OUT!!!!!!!!!!!!!!!!!!!!
+#	r4 :	  pointer to weights line (const fcomplex[16])
+#	r5 :	  pointer to first sample (const i16complex[16*r7])
+#	r6 :	  pointer to result (fcomplex *)
+#	r7 :	  number of samples / 16
+
+#	internally used:
+#	r9 :	  8
+#	r10 :	  2048
+#	r11 :	  ptr to constant table
+#	r28-r31 : prefetched samples
+#	f0-f15 :  delay line (real in primary, imaginary in secondary unit)
+#	f16-f23 : weights (these are real values alternately stored in primary
+#			   and secondary units)
+#	f24-f31 : sums
+
+#	The implementation works on 8 time samples concurrently, to avoid
+#	stalls in the double hummer.  This unfortunately leads to totally
+#	incomprehensible code.  The loop processes 16 samples at a time.
+#	The input is converted from int16complex to dcomplex by black magic,
+#	making the code even harder to understand.
+
+	mtctr	7		# set number of iterations
+
+	li	9,-DCOMPLEX_SIZE
+	stfpdux 14,1,9		# save call-saved registers
+	stfpdux 15,1,9
+	stfpdux 16,1,9
+	stfpdux 17,1,9
+	stfpdux 18,1,9
+	stfpdux 19,1,9
+	stfpdux 20,1,9
+	stfpdux 21,1,9
+	stfpdux 22,1,9
+	stfpdux 23,1,9
+	stfpdux 24,1,9
+	stfpdux 25,1,9
+	stfpdux 26,1,9
+	stfpdux 27,1,9
+	stfpdux 28,1,9
+	stfpdux 29,1,9
+	stfpdux 30,1,9
+	stfpdux 31,1,9
+
+	subi	1,1,16
+	stmw	28,0(1)	# save r28 ... r31
+
+	lis	11,_FIR_fp_table@ha
+	li	9,FCOMPLEX_SIZE
+	la	11,_FIR_fp_table@l(11)
+	li	10,I16COMPLEX_SIZE*NR_SUBBAND_CHANNELS*NR_POLARIZATIONS
+	subi	6,6,FCOMPLEX_SIZE
+
+
+#if 0
+	lfpsx	1,0,3		# load delay line
+	lfpsux	2,3,9
+	lfpsux	3,3,9
+	lfpsux	4,3,9
+	lfpsux	5,3,9
+	lfpsux	6,3,9
+	lfpsux	7,3,9
+	lfpsux	8,3,9
+	lfpsux	9,3,9
+	lfpsux	10,3,9
+	lfpsux	11,3,9
+	lfpsux	12,3,9
+	lfpsux	13,3,9
+	lfpsux	14,3,9
+	lfpsux	15,3,9
+#endif
+
+	# convert 15 i16complex numbers to fcomplex
+	lwzx	29,0,5		# fetch FIR history samples
+	lwzux	30,5,10
+	lwzux	31,5,10
+	lwzux	28,5,10
+
+	rlwinm	12,29,2,14,29
+	lfssx	1,11,12
+	rlwinm	12,29,18,14,29
+	lfsx	1,11,12
+	lwzux	29,5,10
+
+	rlwinm	12,30,2,14,29
+	lfssx	2,11,12
+	rlwinm	12,30,18,14,29
+	lfsx	2,11,12
+	lwzux	30,5,10
+
+	rlwinm	12,31,2,14,29
+	lfssx	3,11,12
+	rlwinm	12,31,18,14,29
+	lfsx	3,11,12
+	lwzux	31,5,10
+
+	rlwinm	12,28,2,14,29
+	lfssx	4,11,12
+	rlwinm	12,28,18,14,29
+	lfsx	4,11,12
+	lwzux	28,5,10
+
+	rlwinm	12,29,2,14,29
+	lfssx	5,11,12
+	rlwinm	12,29,18,14,29
+	lfsx	5,11,12
+	lwzux	29,5,10
+
+	rlwinm	12,30,2,14,29
+	lfssx	6,11,12
+	rlwinm	12,30,18,14,29
+	lfsx	6,11,12
+	lwzux	30,5,10
+
+	rlwinm	12,31,2,14,29
+	lfssx	7,11,12
+	rlwinm	12,31,18,14,29
+	lfsx	7,11,12
+	lwzux	31,5,10
+
+	rlwinm	12,28,2,14,29
+	lfssx	8,11,12
+	rlwinm	12,28,18,14,29
+	lfsx	8,11,12
+	lwzux	28,5,10
+
+	rlwinm	12,29,2,14,29
+	lfssx	9,11,12
+	rlwinm	12,29,18,14,29
+	lfsx	9,11,12
+	lwzux	29,5,10
+
+	rlwinm	12,30,2,14,29
+	lfssx	10,11,12
+	rlwinm	12,30,18,14,29
+	lfsx	10,11,12
+	lwzux	30,5,10
+
+	rlwinm	12,31,2,14,29
+	lfssx	11,11,12
+	rlwinm	12,31,18,14,29
+	lfsx	11,11,12
+	lwzux	31,5,10
+
+	rlwinm	12,28,2,14,29
+	lfssx	12,11,12
+	rlwinm	12,28,18,14,29
+	lfsx	12,11,12
+	lwzux	28,5,10 # prefetch samples before entering loop
+
+	rlwinm	12,29,2,14,29
+	lfssx	13,11,12
+	rlwinm	12,29,18,14,29
+	lfsx	13,11,12
+	lwzux	29,5,10
+
+	rlwinm	12,30,2,14,29
+	lfssx	14,11,12
+	rlwinm	12,30,18,14,29
+	lfsx	14,11,12
+	lwzux	30,5,10
+
+	rlwinm	12,31,2,14,29
+	lfssx	15,11,12
+	rlwinm	12,31,18,14,29
+	lfsx	15,11,12
+	lwzux	31,5,10
+
+				; lfpsx	  16,0,4	# load weights line
+				; lfpsux  17,4,9
+				; lfpsux  18,4,9
+				; lfpsux  19,4,9
+				; lfpsux  20,4,9
+				; lfpsux  21,4,9
+				; lfpsux  22,4,9
+				; lfpsux  23,4,9
+
+	# essentially part of loop, but interleaved with
+	# the stores in the tail of the loop
+	fxpmul	 24,20,8
+	fxsmul	 25,20,8
+	fxpmul	 26,21,8
+	fxsmul	 27,21,8
+	fxpmul	 28,22,8
+	fxsmul	 29,22,8
+	fxpmul	 30,23,8
+	fxsmul	 31,23,8
+
+loop:
+	# time steps 0-7
+
+	# load 0
 
-	fxcsmadd 24,23,12,24	; lfpsx	 12,11,28
-	fxcpmadd 25,20,4,25	; lbzux	 28,5,10
-	fxcsmadd 26,20,4,26
-	fxcpmadd 27,21,4,27
-	fxcsmadd 28,21,4,28
+	fxcsmadd 24,23,1,24	; rlwinm 12,28,2,14,29
+	# load 1
+	fxcpmadd 25,20,9,25	; lfssx  0,11,12
+	fxcsmadd 26,20,9,26	; rlwinm 12,28,18,14,29
+	fxcpmadd 27,21,9,27	; lfsx	 0,11,12
+	fxcsmadd 28,21,9,28	; lwzux	 28,5,10
+	fxcpmadd 29,22,9,29
+	fxcsmadd 30,22,9,30
+	fxcpmadd 31,23,9,31
+
+	fxcpmadd 24,23,2,24	; rlwinm 12,29,2,14,29
+	fxcsmadd 25,23,2,25	; lfssx  1,11,12
+	# load 2
+	fxcpmadd 26,20,10,26	; rlwinm 12,29,18,14,29
+	fxcsmadd 27,20,10,27	; lfsx	 1,11,12
+	fxcpmadd 28,21,10,28	; lwzux	 29,5,10
+	fxcsmadd 29,21,10,29
+	fxcpmadd 30,22,10,30
+	fxcsmadd 31,22,10,31
+
+	fxcsmadd 24,22,3,24	; rlwinm 12,30,2,14,29
+	fxcpmadd 25,23,3,25	; lfssx  2,11,12
+	fxcsmadd 26,23,3,26	; rlwinm 12,30,18,14,29
+	# load 3
+	fxcpmadd 27,20,11,27	; lfsx	 2,11,12
+	fxcsmadd 28,20,11,28	; lwzux	 30,5,10
+	fxcpmadd 29,21,11,29
+	fxcsmadd 30,21,11,30
+	fxcpmadd 31,22,11,31
+
+	fxcpmadd 24,22,4,24	; rlwinm 12,31,2,14,29
+	fxcsmadd 25,22,4,25	; lfssx  3,11,12
+	fxcpmadd 26,23,4,26	; rlwinm 12,31,18,14,29
+	fxcsmadd 27,23,4,27	; lfsx   3,11,12
+	# load 4
+	fxcpmadd 28,20,12,28	; lwzux	 31,5,10
+	fxcsmadd 29,20,12,29
+	fxcpmadd 30,21,12,30
+	fxcsmadd 31,21,12,31
 
-	fxcsmadd 24,19,4,24
-	fxcpmadd 25,16,12,25
-	fxcsmadd 26,16,12,26
-	fxcpmadd 27,17,12,27
-	fxcsmadd 28,17,12,28
+	fxcsmadd 24,21,5,24
+	fxcpmadd 25,22,5,25
+	fxcsmadd 26,22,5,26
+	fxcpmadd 27,23,5,27
+	fxcsmadd 28,23,5,28
+	# load 5
+	fxcpmadd 29,20,13,29
+	fxcsmadd 30,20,13,30
+	fxcpmadd 31,21,13,31
 
-	fxcpmadd 24,23,13,24	; slwi	 29,29,3
-	fxcsmadd 25,23,13,25	; lfpsx	 13,11,29
-	fxcpmadd 26,20,5,26	; lbzux	 29,5,10
-	fxcsmadd 27,20,5,27
-	fxcpmadd 28,21,5,28
-	
-	fxcsmadd 24,17,8,24
-	fxcpmadd 25,18,8,25
-	fxcsmadd 26,18,8,26
-	fxcpmadd 27,19,8,27
-	fxcsmadd 28,19,8,28
+	fxcpmadd 24,21,6,24
+	fxcsmadd 25,21,6,25
+	fxcpmadd 26,22,6,26
+	fxcsmadd 27,22,6,27
+	fxcpmadd 28,23,6,28
+	fxcsmadd 29,23,6,29
+	# load 6
+	fxcpmadd 30,20,14,30
+	fxcsmadd 31,20,14,31
 
-	fxcpmadd 24,19,5,24
-	fxcsmadd 25,19,5,25
-	fxcpmadd 26,16,13,26
-	fxcsmadd 27,16,13,27
-	fxcpmadd 28,17,13,28
+	fxcsmadd 24,20,7,24
+	fxcpmadd 25,21,7,25
+	fxcsmadd 26,21,7,26
+	fxcpmadd 27,22,7,27	; rlwinm 12,28,2,14,29
+	fxcsmadd 28,22,7,28	; lfssx	 4,11,12
+	fxcpmadd 29,23,7,29
+	fxcsmadd 30,23,7,30
+	# load 7
+	fxcpmadd 31,20,15,31
 
-	fxcsmadd 24,22,14,24
-	fxcpmadd 25,23,14,25	; slwi	 30,30,3
-	fxcsmadd 26,23,14,26	; lfpsx	 14,11,30
-	fxcpmadd 27,20,6,27	; lbzux	 30,5,10
-	fxcsmadd 28,20,6,28
-	
-	fxcsmadd 24,18,6,24
-	fxcpmadd 25,19,6,25
-	fxcsmadd 26,19,6,26
-	fxcpmadd 27,16,14,27
-	fxcsmadd 28,16,14,28
+	# after loads
+	fxcpmadd 24,16,0,24	; rlwinm 12,28,18,14,29
+	fxcsmadd 25,16,0,25	; lfsx   4,11,12
+	fxcpmadd 26,17,0,26
+	fxcsmadd 27,17,0,27	; lwzux	 28,5,10
+	fxcpmadd 28,18,0,28
+	fxcsmadd 29,18,0,29
+	fxcpmadd 30,19,0,30
+	fxcsmadd 31,19,0,31	; rlwinm 12,29,2,14,29
 
-	fxcpmadd 24,21,1,24
-	fxcsmadd 25,21,1,25
-	fxcpmadd 26,22,1,26
-	fxcsmadd 27,22,1,27
-	fxcpmadd 28,23,1,28
+	fxcsmadd 24,19,9,24	; lfssx  5,11,12
+	fxcpmadd 25,16,1,25
+	fxcsmadd 26,16,1,26
+	fxcpmadd 27,17,1,27
+	fxcsmadd 28,17,1,28	; rlwinm 12,29,18,14,29
+	fxcpmadd 29,18,1,29	; lfsx   5,11,12
+	fxcsmadd 30,18,1,30
+	fxcpmadd 31,19,1,31	; lwzux	 29,5,10
 
-	fxcpmadd 24,22,15,24
-	fxcsmadd 25,22,15,25
-	fxcpmadd 26,23,15,26	; slwi	 31,31,3
-	fxcsmadd 27,23,15,27	; lfpsx	 15,11,31
-	fxcpmadd 28,20,7,28	; lbzux	 31,5,10
+	fxcpmadd 24,19,10,24
+	fxcsmadd 25,19,10,25
+	fxcpmadd 26,16,2,26
+	fxcsmadd 27,16,2,27	; rlwinm 12,30,2,14,29
+	fxcpmadd 28,17,2,28	; lfssx  6,11,12
+	fxcsmadd 29,17,2,29
+	fxcpmadd 30,18,2,30
+	fxcsmadd 31,18,2,31
 
-	fxcpmadd 24,18,7,24
-	fxcsmadd 25,18,7,25
-	fxcpmadd 26,19,7,26
-	fxcsmadd 27,19,7,27
-	fxcpmadd 28,16,15,28
+	fxcsmadd 24,18,11,24	; rlwinm 12,30,18,14,29
+	fxcpmadd 25,19,11,25	; lfsx   6,11,12
+	fxcsmadd 26,19,11,26
+	fxcpmadd 27,16,3,27	; lwzux	 30,5,10
+	fxcsmadd 28,16,3,28
+	fxcpmadd 29,17,3,29
+	fxcsmadd 30,17,3,30
+	fxcpmadd 31,18,3,31	; rlwinm 12,31,2,14,29
+	
+	fxcpmadd 24,18,12,24	; lfssx  7,11,12
+	fxcsmadd 25,18,12,25
+	fxcpmadd 26,19,12,26
+	fxcsmadd 27,19,12,27
+	fxcpmadd 28,16,4,28	; rlwinm 12,31,18,14,29
+	fxcsmadd 29,16,4,29	; lfsx   7,11,12
+	fxcpmadd 30,17,4,30
+	fxcsmadd 31,17,4,31	; lwzux	 31,5,10
 
-	fxpmul	 29,19,15	; stfpsux 24,6,9
-	fxsmul	 24,16,15	; stfpsux 25,6,9
-	fxpmul	 25,17,15	; stfpsux 26,6,9
-	fxsmul	 26,17,15	; stfpsux 27,6,9
-	fxpmul	 27,18,15	; stfpsux 28,6,9
-	fxsmul	 28,18,15
+	fxcsmadd 24,17,13,24
+	fxcpmadd 25,18,13,25
+	fxcsmadd 26,18,13,26
+	fxcpmadd 27,19,13,27
+	fxcsmadd 28,19,13,28
+	fxcpmadd 29,16,5,29
+	fxcsmadd 30,16,5,30
+	fxcpmadd 31,17,5,31	; rlwinm 12,28,2,14,29
+
+	fxcpmadd 24,17,14,24	; lfssx  8,11,12
+	fxcsmadd 25,17,14,25
+	fxcpmadd 26,18,14,26
+	fxcsmadd 27,18,14,27
+	fxcpmadd 28,19,14,28	; rlwinm 12,28,18,14,29
+	fxcsmadd 29,19,14,29	; lfsx   8,11,12
+	fxcpmadd 30,16,6,30
+	fxcsmadd 31,16,6,31	; lwzux	 28,5,10
+
+	fxcsmadd 24,16,15,24
+	fxcpmadd 25,17,15,25
+	fxcsmadd 26,17,15,26
+	fxcpmadd 27,18,15,27
+	fxcsmadd 28,18,15,28
+	fxcpmadd 29,19,15,29	; stfpsux 24,6,9
+	fxcsmadd 30,19,15,30	; stfpsux 25,6,9
+	fxcpmadd 31,16,7,31	; stfpsux 26,6,9
 
+	
+	# time steps 8-15
+	fxpmul	 24,20,0
+	fxsmul	 25,20,0
+	fxpmul	 26,21,0	; stfpsux 27,6,9
+	fxsmul	 27,21,0	; stfpsux 28,6,9
+	fxpmul	 28,22,0	; stfpsux 29,6,9
+	fxsmul	 29,22,0	; stfpsux 30,6,9
+	fxpmul	 30,23,0	; stfpsux 31,6,9
+	fxsmul	 31,23,0
+
+	# already loaded 8
+
+	fxcsmadd 24,23,9,24
+	# load 9
+	fxcpmadd 25,20,1,25
+	fxcsmadd 26,20,1,26
+	fxcpmadd 27,21,1,27
+	fxcsmadd 28,21,1,28
+	fxcpmadd 29,22,1,29
+	fxcsmadd 30,22,1,30
+	fxcpmadd 31,23,1,31
+
+	fxcpmadd 24,23,10,24
+	fxcsmadd 25,23,10,25
+	# load 10
+	fxcpmadd 26,20,2,26
+	fxcsmadd 27,20,2,27
+	fxcpmadd 28,21,2,28
+	fxcsmadd 29,21,2,29
+	fxcpmadd 30,22,2,30
+	fxcsmadd 31,22,2,31	; rlwinm 12,29,2,14,29
+
+	fxcsmadd 24,22,11,24	; lfssx  9,11,12
+	fxcpmadd 25,23,11,25
+	fxcsmadd 26,23,11,26
+	# load 11
+	fxcpmadd 27,20,3,27
+	fxcsmadd 28,20,3,28	; rlwinm 12,29,18,14,29
+	fxcpmadd 29,21,3,29	; lfsx   9,11,12
+	fxcsmadd 30,21,3,30
+	fxcpmadd 31,22,3,31	; lwzux	 29,5,10
+
+	fxcpmadd 24,22,12,24
+	fxcsmadd 25,22,12,25
+	fxcpmadd 26,23,12,26
+	fxcsmadd 27,23,12,27	; rlwinm 12,30,2,14,29
+	# load 12
+	fxcpmadd 28,20,4,28	; lfssx  10,11,12
+	fxcsmadd 29,20,4,29
+	fxcpmadd 30,21,4,30
+	fxcsmadd 31,21,4,31
+
+	fxcsmadd 24,21,13,24	; rlwinm 12,30,18,14,29
+	fxcpmadd 25,22,13,25	; lfsx   10,11,12
+	fxcsmadd 26,22,13,26
+	fxcpmadd 27,23,13,27	; lwzux	 30,5,10
+	fxcsmadd 28,23,13,28
+	# load 13
+	fxcpmadd 29,20,5,29
+	fxcsmadd 30,20,5,30
+	fxcpmadd 31,21,5,31	; rlwinm 12,31,2,14,29
+
+	fxcpmadd 24,21,14,24	; lfssx  11,11,12
+	fxcsmadd 25,21,14,25
+	fxcpmadd 26,22,14,26
+	fxcsmadd 27,22,14,27
+	fxcpmadd 28,23,14,28	; rlwinm 12,31,18,14,29
+	fxcsmadd 29,23,14,29	; lfsx   11,11,12
+	# load 14
+	fxcpmadd 30,20,6,30
+	fxcsmadd 31,20,6,31	; lwzux	 31,5,10
+
+	fxcsmadd 24,20,15,24
+	fxcpmadd 25,21,15,25
+	fxcsmadd 26,21,15,26
+	fxcpmadd 27,22,15,27	; rlwinm 12,28,2,14,29
+	fxcsmadd 28,22,15,28	; lfssx  12,11,12
+	fxcpmadd 29,23,15,29
+	fxcsmadd 30,23,15,30
+	# load 15
+	fxcpmadd 31,20,7,31
+
+	# after loads
+
+	fxcpmadd 24,16,8,24	; rlwinm 12,28,18,14,29
+	fxcsmadd 25,16,8,25	; lfsx   12,11,12
+	fxcpmadd 26,17,8,26
+	fxcsmadd 27,17,8,27	; lwzux	 28,5,10
+	fxcpmadd 28,18,8,28
+	fxcsmadd 29,18,8,29
+	fxcpmadd 30,19,8,30
+	fxcsmadd 31,19,8,31	; rlwinm 12,29,2,14,29
+
+	fxcsmadd 24,19,1,24	; lfssx  13,11,12
+	fxcpmadd 25,16,9,25
+	fxcsmadd 26,16,9,26
+	fxcpmadd 27,17,9,27
+	fxcsmadd 28,17,9,28	; rlwinm 12,29,18,14,29
+	fxcpmadd 29,18,9,29	; lfsx   13,11,12
+	fxcsmadd 30,18,9,30
+	fxcpmadd 31,19,9,31	; lwzux	 29,5,10
+
+	fxcpmadd 24,19,2,24
+	fxcsmadd 25,19,2,25
+	fxcpmadd 26,16,10,26
+	fxcsmadd 27,16,10,27	; rlwinm 12,30,2,14,29
+	fxcpmadd 28,17,10,28	; lfssx  14,11,12
+	fxcsmadd 29,17,10,29
+	fxcpmadd 30,18,10,30
+	fxcsmadd 31,18,10,31
+
+	fxcsmadd 24,18,3,24	; rlwinm 12,30,18,14,29
+	fxcpmadd 25,19,3,25	; lfsx   14,11,12
+	fxcsmadd 26,19,3,26
+	fxcpmadd 27,16,11,27	; lwzux	 30,5,10
+	fxcsmadd 28,16,11,28
+	fxcpmadd 29,17,11,29
+	fxcsmadd 30,17,11,30
+	fxcpmadd 31,18,11,31	; rlwinm 12,31,2,14,29
+
+	fxcpmadd 24,18,4,24	; lfssx  15,11,12
+	fxcsmadd 25,18,4,25
+	fxcpmadd 26,19,4,26
+	fxcsmadd 27,19,4,27
+	fxcpmadd 28,16,12,28	; rlwinm 12,31,18,14,29
+	fxcsmadd 29,16,12,29	; lfsx   15,11,12
+	fxcpmadd 30,17,12,30
+	fxcsmadd 31,17,12,31	; lwzux	 31,5,10
+	
+	fxcsmadd 24,17,5,24
+	fxcpmadd 25,18,5,25
+	fxcsmadd 26,18,5,26
+	fxcpmadd 27,19,5,27
+	fxcsmadd 28,19,5,28
+	fxcpmadd 29,16,13,29
+	fxcsmadd 30,16,13,30
+	fxcpmadd 31,17,13,31
+
+	fxcpmadd 24,17,6,24
+	fxcsmadd 25,17,6,25
+	fxcpmadd 26,18,6,26
+	fxcsmadd 27,18,6,27
+	fxcpmadd 28,19,6,28
+	fxcsmadd 29,19,6,29
+	fxcpmadd 30,16,14,30
+	fxcsmadd 31,16,14,31
+
+	fxcsmadd 24,16,7,24
+	fxcpmadd 25,17,7,25
+	fxcsmadd 26,17,7,26
+	fxcpmadd 27,18,7,27
+	fxcsmadd 28,18,7,28
+	fxcpmadd 29,19,7,29	; stfpsux 24,6,9
+	fxcsmadd 30,19,7,30	; stfpsux 25,6,9
+	fxcpmadd 31,16,15,31	; stfpsux 26,6,9
+
+	fxpmul	 24,20,8	# part of next loop
+	fxsmul	 25,20,8
+	fxpmul	 26,21,8	; stfpsux 27,6,9
+	fxsmul	 27,21,8	; stfpsux 28,6,9
+	fxpmul	 28,22,8	; stfpsux 29,6,9
+	fxsmul	 29,22,8	; stfpsux 30,6,9
+	fxpmul	 30,23,8	; stfpsux 31,6,9
+	fxsmul	 31,23,8
 	bdnz	loop
 
-	la	8,16(1)	# restore call-saved registers
-	li	9,DCOMPLEX_SIZE
+#if 0
+	addi	3,3,-120	# store delay line
+	stfpsux	1,3,9
+	stfpsux	2,3,9
+	stfpsux	3,3,9
+	stfpsux	4,3,9
+	stfpsux	5,3,9
+	stfpsux	6,3,9
+	stfpsux	7,3,9
+	stfpsux	8,3,9
+	stfpsux	9,3,9
+	stfpsux	10,3,9
+	stfpsux	11,3,9
+	stfpsux	12,3,9
+	stfpsux	13,3,9
+	stfpsux	14,3,9
+	stfpsux	15,3,9
+#endif
 
-	lfpdx	14,0,8
-	lfpdux	15,8,9
-	lfpdux	16,8,9
-	lfpdux	17,8,9
-	lfpdux	18,8,9
-	lfpdux	19,8,9
-	lfpdux	20,8,9
-	lfpdux	21,8,9
-	lfpdux	22,8,9
-	lfpdux	23,8,9
-	lfpdux	24,8,9
-	lfpdux	25,8,9
-	lfpdux	26,8,9
-	lfpdux	27,8,9
-	lfpdux	28,8,9
-	lfpdux	29,8,9
-	lmw	28,0(1)		# restore r28 ... r31
+	li	9,DCOMPLEX_SIZE	# restore call-saved registers
+	lmw	28,0(1)	# restore r28 ... r31
 
-	addi	1,1,272		# restore stack pointer
+	#addi	1,1,16
+	lfpdux	31,1,9
+	lfpdux	30,1,9
+	lfpdux	29,1,9
+	lfpdux	28,1,9
+	lfpdux	27,1,9
+	lfpdux	26,1,9
+	lfpdux	25,1,9
+	lfpdux	24,1,9
+	lfpdux	23,1,9
+	lfpdux	22,1,9
+	lfpdux	21,1,9
+	lfpdux	20,1,9
+	lfpdux	19,1,9
+	lfpdux	18,1,9
+	lfpdux	17,1,9
+	lfpdux	16,1,9
+	lfpdux	15,1,9
+	lfpdux	14,1,9
+
+	addi	1,1,16		# restore stack pointer
 	blr			# return
 
-#elif INPUT_TYPE == I16COMPLEX_TYPE
+#elif NR_BITS_PER_SAMPLE == 16
 
 .align	5
 sub_value:
@@ -1158,116 +1992,118 @@ _filter:
 loop:
 	# time steps 0-5
 
-	fxcpmadd 24,17,14,24
-	fxcsmadd 25,17,14,25	; xor	 28,28,12
-	fxcpmadd 26,18,14,26	; sthbrx 28,7,1
-	fxcsmadd 27,18,14,27
-	fxcpmadd 28,19,14,28
-	fxcsmadd 29,19,14,29	; srawi	 28,28,16
+	fxcsmadd 24,23,1,24	; xor	 28,28,12
+	fxcpmadd 25,20,9,25	; sthbrx 28,7,1
+	fxcsmadd 26,20,9,26	; srawi	 28,28,16
+	fxcpmadd 27,21,9,27	; sthbrx 28,11,1
+	fxcsmadd 28,21,9,28
+	fxcpmadd 29,22,9,29
 
-	fxcpmadd 24,21,6,24	; sthbrx 28,11,1
-	fxcsmadd 25,21,6,25
-	fxcpmadd 26,22,6,26
-	fxcsmadd 27,22,6,27	; lfpdx	 30,8,1
-	fxcpmadd 28,23,6,28
-	fxcsmadd 29,23,6,29	; lwzux	 28,5,10
+	fxcpmadd 24,23,2,24
+	fxcsmadd 25,23,2,25	; lfpdx	 30,8,1
+	fxcpmadd 26,20,10,26
+	fxcsmadd 27,20,10,27
+	fxcpmadd 28,21,10,28
+	fxcsmadd 29,21,10,29	; lwzux	 28,5,10
 
-	fxcpmadd 24,20,8,24
-	fxcsmadd 25,20,8,25
-	fpsub	0,30,31
-	fxcpmadd 26,21,8,26	; xor	 29,29,12
-	fxcsmadd 27,21,8,27	; sthbrx 29,7,1
-	fxcpmadd 28,22,8,28
-	fxcsmadd 29,22,8,29
+	fxcsmadd 24,22,3,24
+	fxcpmadd 25,23,3,25
+	fxcsmadd 26,23,3,26
+	fxcpmadd 27,20,11,27	; xor	 29,29,12
+	fxcsmadd 28,20,11,28	; sthbrx 29,7,1
+	fxcpmadd 29,21,11,29	; srawi	 29,29,16
 
-	fxcpmadd 24,16,0,24	; srawi	 29,29,16
-	fxcsmadd 25,16,0,25	; sthbrx 29,11,1
-	fxcpmadd 26,17,0,26
-	fxcsmadd 27,17,0,27
-	fxcpmadd 28,18,0,28	; lfpdx	 30,8,1
-	fxcsmadd 29,18,0,29
+	fxcpmadd 24,22,4,24	; sthbrx 29,11,1
+	fxcsmadd 25,22,4,25
+	fxcpmadd 26,23,4,26
+	fxcsmadd 27,23,4,27
+	fpsub	0,30,31	
+	fxcpmadd 28,20,12,28	; lfpdx	 30,8,1
+	fxcsmadd 29,20,12,29
 
-	fxcsmadd 24,23,1,24	; lwzux	 29,5,10
-	fxcpmadd 25,20,9,25
-	fxcsmadd 26,20,9,26
-	fpsub	1,30,31
-	fxcpmadd 27,21,9,27	; xor	 30,30,12
-	fxcsmadd 28,21,9,28	; sthbrx 30,7,1
-	fxcpmadd 29,22,9,29
+	fxcsmadd 24,21,5,24
+	fxcpmadd 25,22,5,25
+	fxcsmadd 26,22,5,26	; lwzux	 29,5,10
+	fxcpmadd 27,23,5,27
+	fxcsmadd 28,23,5,28
+	fxcpmadd 29,20,13,29
 
-	fxcsmadd 24,19,9,24
-	fxcpmadd 25,16,1,25	; srawi	 30,30,16
-	fxcsmadd 26,16,1,26	; sthbrx 30,11,1
-	fxcpmadd 27,17,1,27
-	fxcsmadd 28,17,1,28
-	fxcpmadd 29,18,1,29	; lfpdx	 30,8,1
+	fxcpmadd 24,21,6,24	; xor	 30,30,12
+	fxcsmadd 25,21,6,25	; sthbrx 30,7,1
+	fxcpmadd 26,22,6,26	; srawi	 30,30,16
+	fxcsmadd 27,22,6,27	; sthbrx 30,11,1
+	fxcpmadd 28,23,6,28
+	fxcsmadd 29,23,6,29
 
-	fxcpmadd 24,23,2,24
-	fxcsmadd 25,23,2,25	; lwzux	 30,5,10
-	fxcpmadd 26,20,10,26
-	fxcsmadd 27,20,10,27
-	fpsub	2,30,31
-	fxcpmadd 28,21,10,28	; xor	 31,31,12
-	fxcsmadd 29,21,10,29	; sthbrx 31,7,1
-	
 	fxcsmadd 24,20,7,24
 	fxcpmadd 25,21,7,25
-	fxcsmadd 26,21,7,26
+	fpsub	1,30,31	
+	fxcsmadd 26,21,7,26	; lfpdx	 30,8,1
 	fxcpmadd 27,22,7,27
 	fxcsmadd 28,22,7,28
-	fxcpmadd 29,23,7,29
+	fxcpmadd 29,23,7,29	; lwzux	 30,5,10
+
+	fxcpmadd 24,20,8,24
+	fxcsmadd 25,20,8,25
+	fxcpmadd 26,21,8,26
+	fxcsmadd 27,21,8,27
+	fxcpmadd 28,22,8,28
+	fxcsmadd 29,22,8,29	; xor	 31,31,12
+
+	fxcpmadd 24,17,14,24	; sthbrx 31,7,1
+	fxcsmadd 25,17,14,25
+	fxcpmadd 26,18,14,26
+	fxcsmadd 27,18,14,27	; srawi	 31,31,16
+	fxcpmadd 28,19,14,28	; sthbrx 31,11,1
+	fxcsmadd 29,19,14,29
+
+	fxcpmadd 24,16,0,24
+	fxcsmadd 25,16,0,25
+	fpsub	2,30,31	
+	fxcpmadd 26,17,0,26	; lfpdx	 30,8,1
+	fxcsmadd 27,17,0,27
+	fxcpmadd 28,18,0,28
+	fxcsmadd 29,18,0,29	; lwzux	 31,5,10
+
+	fpsub	3,30,31		; xor	 28,28,12
+	fxcsmadd 24,19,9,24	; sthbrx 28,7,1
+	fxcpmadd 25,16,1,25
+	fxcsmadd 26,16,1,26
+	fxcpmadd 27,17,1,27	; srawi	 28,28,16
+	fxcsmadd 28,17,1,28	; sthbrx 28,11,1
+	fxcpmadd 29,18,1,29
 
 	fxcpmadd 24,19,10,24
 	fxcsmadd 25,19,10,25
-	fxcpmadd 26,16,2,26	; srawi	 31,31,16
-	fxcsmadd 27,16,2,27	; sthbrx 31,11,1
+	fxcpmadd 26,16,2,26	; lfpdx	 30,8,1
+	fxcsmadd 27,16,2,27
 	fxcpmadd 28,17,2,28
-	fxcsmadd 29,17,2,29
-
-	fxcsmadd 24,22,3,24	; lfpdx	 30,8,1
-	fxcpmadd 25,23,3,25
-	fxcsmadd 26,23,3,26	; lwzux	 31,5,10
-	fxcpmadd 27,20,11,27
-	fxcsmadd 28,20,11,28
-	fpsub	3,30,31
-	fxcpmadd 29,21,11,29	; xor	 28,28,12
+	fxcsmadd 29,17,2,29	; lwzux	 28,5,10
 
-	fxcsmadd 24,18,11,24	; sthbrx 28,7,1
+	fpsub	4,30,31		; xor	 29,29,12
+	fxcsmadd 24,18,11,24	; sthbrx 29,7,1
 	fxcpmadd 25,19,11,25
 	fxcsmadd 26,19,11,26
-	fxcpmadd 27,16,3,27	; srawi	 28,28,16
-	fxcsmadd 28,16,3,28	; sthbrx 28,11,1
+	fxcpmadd 27,16,3,27	; srawi	 29,29,16
+	fxcsmadd 28,16,3,28	; sthbrx 29,11,1
 	fxcpmadd 29,17,3,29
 
-	fxcpmadd 24,22,4,24
-	fxcsmadd 25,22,4,25	; lfpdx	 30,8,1
-	fxcpmadd 26,23,4,26
-	fxcsmadd 27,23,4,27	; lwzux	 28,5,10
-	fxcpmadd 28,20,12,28
-	fxcsmadd 29,20,12,29
-
-	fpsub	4,30,31
-	fxcpmadd 24,18,12,24	; xor	 29,29,12
-	fxcsmadd 25,18,12,25	; sthbrx 29,7,1
-	fxcpmadd 26,19,12,26
+	fxcpmadd 24,18,12,24
+	fxcsmadd 25,18,12,25
+	fxcpmadd 26,19,12,26	; lfpdx	 30,8,1
 	fxcsmadd 27,19,12,27
-	fxcpmadd 28,16,4,28	; srawi	 29,29,16
-	fxcsmadd 29,16,4,29	; sthbrx 29,11,1
-
-	fxcsmadd 24,21,5,24
-	fxcpmadd 25,22,5,25
-	fxcsmadd 26,22,5,26	; lfpdx	 30,8,1
-	fxcpmadd 27,23,5,27
-	fxcsmadd 28,23,5,28	; lwzux	 29,5,10
-	fxcpmadd 29,20,13,29
+	fxcpmadd 28,16,4,28
+	fxcsmadd 29,16,4,29	; lwzux	 29,5,10
 
-	fxcsmadd 24,17,13,24
 	fpsub	5,30,31
+	fxcsmadd 24,17,13,24
 	fxcpmadd 25,18,13,25
 	fxcsmadd 26,18,13,26
 	fxcpmadd 27,19,13,27
-	fxcsmadd 28,19,13,28	; stfpsux 24,6,9
-	fxcpmadd 29,16,5,29	; stfpsux 25,6,9
+	fxcsmadd 28,19,13,28
+	fxcpmadd 29,16,5,29
+		; stfpsux 24,6,9
+		; stfpsux 25,6,9
 
 
 	# time steps 6-10
@@ -1530,9 +2366,8 @@ loop:
 
 	addi	1,1,16		# restore stack pointer
 	blr			# return
-
 #else
-#error INPUT_TYPE not supported
+#error unsupported NR_BITS_PER_SAMPLE
 #endif
 
 #if 0
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h
index c40ce8e59ffd298c7800dd947e6768db651f9cfb..a5f0eaf25fc09cca82f78ea7d5abb569f75386c1 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h
@@ -61,7 +61,7 @@ extern "C" {
   void _prefetch(const void *src, size_t count, size_t stride);
 
   extern struct {
-    unsigned input_type;
+    unsigned nr_bits_per_sample;
     unsigned nr_subband_channels;
     unsigned nr_taps;
     unsigned nr_polarizations;
@@ -72,6 +72,14 @@ extern "C" {
 #endif
 
   unsigned long long _rdtsc();
+
+#if NR_BITS_PER_SAMPLE == 4
+  extern fcomplex _FIR_fp_table[16][16];  // 4-bit samples: the PPF constructor sets entry [i][j] to makefcomplex(map[j], map[i])
+#elif NR_BITS_PER_SAMPLE == 8
+  extern fcomplex _FIR_fp_table[256][256];  // 8-bit samples: the PPF constructor sets entry [i][j] to ((signed char) i, (signed char) j)
+#elif NR_BITS_PER_SAMPLE == 16
+  extern float _FIR_fp_table[65536];  // 16-bit samples: one float per entry; NOTE(review): the C++ fill loop in PPF.cc is compiled out (#elif 0 && ...) -- confirm where this table gets its contents
+#endif
 };
 
 } // namespace CS1
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc
index de4ea11b9f7c87c854e272809583209f1b4e09ad..4851bd28118dc434587e0b10908f31f62225cd0e 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc
@@ -6,6 +6,7 @@
 #include <FFT_Asm.h>
 #include <FIR_Asm.h>
 
+#include <Common/DataConvert.h>
 #include <Common/Timer.h>
 
 #include <complex>
@@ -57,6 +58,25 @@ PPF::PPF(unsigned nrStations, unsigned nrSamplesPerIntegration, double channelBa
 #endif
 {
   init_fft();
+
+#if !defined PPF_C_IMPLEMENTATION
+#if NR_BITS_PER_SAMPLE == 4
+  static const float map[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, -8, -7, -6, -5, -4, -3, -2, -1,
+  };
+
+  for (unsigned i = 0; i < 16; i ++)
+    for (unsigned j = 0; j < 16; j ++)
+      _FIR_fp_table[i][j] = makefcomplex(map[j], map[i]);
+#elif NR_BITS_PER_SAMPLE == 8
+  for (unsigned i = 0; i < 256; i ++)
+    for (unsigned j = 0; j < 256; j ++)
+      _FIR_fp_table[i][j] = makefcomplex((float) (signed char) i, (float) (signed char) j);
+#elif 0 && NR_BITS_PER_SAMPLE == 16
+  for (unsigned i = 0; i < 65536; i ++)
+    _FIR_fp_table[i] = (float) byteSwap((signed short) i);
+#endif
+#endif
 }
 
 
diff --git a/Appl/CEP/CS1/CS1_BGLProc/test/tBGL_Processing.cc b/Appl/CEP/CS1/CS1_BGLProc/test/tBGL_Processing.cc
index 8d82969b78c3302eb5491a5dca7cdd957f3498c0..d86e62629f274112827f8c0bf7b122164bc42dd0 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/test/tBGL_Processing.cc
+++ b/Appl/CEP/CS1/CS1_BGLProc/test/tBGL_Processing.cc
@@ -52,12 +52,14 @@ inline TransposedData::SampleType toComplex(double phi)
     double s, c;
 
     sincos(phi, &s, &c);
-#if INPUT_TYPE == I4COMPLEX_TYPE
+#if NR_BITS_PER_SAMPLE == 4
     return makei4complex((int) rint(7 * c), (int) rint(7 * s));
-#elif INPUT_TYPE == I16COMPLEX_TYPE
+#elif NR_BITS_PER_SAMPLE == 8
+    return makei8complex((int) rint(127 * c), (int) rint(127 * s));
+#elif NR_BITS_PER_SAMPLE == 16
     return makei16complex((int) rint(32767 * c), (int) rint(32767 * s));
 #else
-#error Unknown INPUT_TYPE
+#error Unknown NR_BITS_PER_SAMPLE
 #endif
 }
 
diff --git a/Appl/CEP/CS1/CS1_Interface/include/CS1_Interface/CS1_Config.h b/Appl/CEP/CS1/CS1_Interface/include/CS1_Interface/CS1_Config.h
index c525af071499ab76ed2b8460b618b3cf8042e104..00cc141693fe9e79e01788fdcd79902a1772cb91 100644
--- a/Appl/CEP/CS1/CS1_Interface/include/CS1_Interface/CS1_Config.h
+++ b/Appl/CEP/CS1/CS1_Interface/include/CS1_Interface/CS1_Config.h
@@ -25,22 +25,21 @@
 
 /* This is included by C++ and assembly files.  Do not put anything but
    constants here! */
-#define INPUT_TYPE		   I16COMPLEX_TYPE
+#define NR_BITS_PER_SAMPLE	   16	/* selects INPUT_SAMPLE_TYPE below; only 4, 8 and 16 pass the #if chain, anything else hits the #error */
 #define NR_POLARIZATIONS	   2
 #define NR_SUBBAND_CHANNELS	   256
 #define NR_TAPS			   16
 
 /* Do not change anything below this line */
 
-#define I4COMPLEX_TYPE		   1
-#define I16COMPLEX_TYPE		   2
-
-#if INPUT_TYPE == I4COMPLEX_TYPE
+#if NR_BITS_PER_SAMPLE == 4
 #define INPUT_SAMPLE_TYPE	   i4complex
-#elif INPUT_TYPE == I16COMPLEX_TYPE
+#elif NR_BITS_PER_SAMPLE == 8
+#define INPUT_SAMPLE_TYPE	   i8complex
+#elif NR_BITS_PER_SAMPLE == 16
 #define INPUT_SAMPLE_TYPE	   i16complex
 #else
-#error Bad INPUT_TYPE
+#error Bad NR_BITS_PER_SAMPLE
 #endif
 
 #endif
diff --git a/LCS/Common/include/Common/ComplexStdInt.h b/LCS/Common/include/Common/ComplexStdInt.h
index 845408344675cc450a93cb34ea18bb22e2c3173a..24706e40638fc877019a9a0e0ab424a6eabb1b12 100644
--- a/LCS/Common/include/Common/ComplexStdInt.h
+++ b/LCS/Common/include/Common/ComplexStdInt.h
@@ -31,10 +31,13 @@
 namespace LOFAR {
 
   namespace TYPES {
+    typedef std::complex<int8>   i8complex;  // complex number whose real and imaginary parts are int8
     typedef std::complex<int16>  i16complex;
     typedef std::complex<uint16> u16complex;
   }
 
+  inline TYPES::i8complex makei8complex (TYPES::int8 re, TYPES::int8 im)  // builds an i8complex from a real part (re) and an imaginary part (im)
+    { return TYPES::i8complex(re,im); }
   inline TYPES::i16complex makei16complex (TYPES::uint16 re, TYPES::uint16 im)
     { return TYPES::i16complex(re,im); }
   inline TYPES::u16complex makeu16complex (TYPES::uint16 re, TYPES::uint16 im)
diff --git a/LCS/Common/include/Common/lofar_complex.h b/LCS/Common/include/Common/lofar_complex.h
index 3ed9e6d1d1bc6d7f5291ba4c27834800a8955bbe..287a6dd380a392529c7ad7bbbcd211eb9fb55313 100644
--- a/LCS/Common/include/Common/lofar_complex.h
+++ b/LCS/Common/include/Common/lofar_complex.h
@@ -60,6 +60,7 @@ namespace LOFAR
 {
   // Define complex types in LOFAR namespace.
   using TYPES::i4complex;
+  using TYPES::i8complex;
   using TYPES::i16complex;
   using TYPES::u16complex;
   using TYPES::fcomplex;
@@ -69,6 +70,10 @@ namespace LOFAR
     return z;
   }
 
+  inline static i4complex makei4complex(i8complex &z) {  // i8complex -> i4complex conversion
+    return makei4complex(real(z), imag(z));  // NOTE(review): no range check here; how out-of-range components are narrowed depends on the two-argument makei4complex -- confirm
+  }
+
   inline static i4complex makei4complex(i16complex &z) {
     return makei4complex(real(z), imag(z));
   }
@@ -85,10 +90,38 @@ namespace LOFAR
     return makei4complex((int) real(z), (int) imag(z));
   }
 
+  inline static i8complex makei8complex(i4complex &z) {  // i4complex -> i8complex conversion
+    return makei8complex(real(z), imag(z));  // forwards both components to the two-argument makei8complex
+  }
+
+  inline static i8complex makei8complex(i8complex &z) {  // identity conversion
+    return z;  // returns a copy of z
+  }
+
+  inline static i8complex makei8complex(i16complex &z) {  // i16complex -> i8complex conversion
+    return makei8complex(real(z), imag(z));  // NOTE(review): components are implicitly narrowed to the int8 parameters, with no saturation or range check -- confirm this is intended
+  }
+
+  inline static i8complex makei8complex(u16complex &z) {  // u16complex -> i8complex conversion
+    return makei8complex(real(z), imag(z));  // NOTE(review): unsigned 16-bit components are implicitly narrowed to the int8 parameters, with no range check -- confirm this is intended
+  }
+
+  inline static i8complex makei8complex(fcomplex &z) {  // fcomplex -> i8complex conversion
+    return makei8complex((int) real(z), (int) imag(z));  // the (int) casts drop the fractional part; the int result is then implicitly narrowed to int8
+  }
+
+  inline static i8complex makei8complex(dcomplex &z) {  // dcomplex -> i8complex conversion
+    return makei8complex((int) real(z), (int) imag(z));  // the (int) casts drop the fractional part; the int result is then implicitly narrowed to int8
+  }
+
   inline static i16complex makei16complex(i4complex &z) {
     return makei16complex(real(z), imag(z));
   }
 
+  inline static i16complex makei16complex(i8complex &z) {  // i8complex -> i16complex conversion
+    return makei16complex(real(z), imag(z));  // NOTE(review): the two-argument makei16complex in ComplexStdInt.h declares uint16 parameters, so negative components take a signed -> unsigned -> signed round trip -- confirm the sign survives on the target compiler
+  }
+
   inline static i16complex makei16complex(i16complex &z) {
     return z;
   }
@@ -109,6 +142,10 @@ namespace LOFAR
     return makeu16complex(real(z), imag(z));
   }
 
+  inline static u16complex makeu16complex(i8complex &z) {  // i8complex -> u16complex conversion
+    return makeu16complex(real(z), imag(z));  // NOTE(review): negative components are converted to the unsigned uint16 parameters and so presumably wrap to large values -- confirm callers expect that
+  }
+
   inline static u16complex makeu16complex(i16complex &z) {
     return makeu16complex(real(z), imag(z));
   }
@@ -129,6 +166,10 @@ namespace LOFAR
     return makefcomplex((float) real(z), (float) imag(z));
   }
 
+  inline static fcomplex makefcomplex(i8complex &z) {  // i8complex -> fcomplex conversion
+    return makefcomplex((float) real(z), (float) imag(z));  // each component is cast to float explicitly before forwarding
+  }
+
   inline static fcomplex makefcomplex(i16complex &z) {
     return makefcomplex((float) real(z), (float) imag(z));
   }
@@ -149,6 +190,10 @@ namespace LOFAR
     return makedcomplex((double) real(z), (double) imag(z));
   }
 
+  inline static dcomplex makedcomplex(i8complex &z) {  // i8complex -> dcomplex conversion
+    return makedcomplex((double) real(z), (double) imag(z));  // each component is cast to double explicitly before forwarding
+  }
+
   inline static dcomplex makedcomplex(i16complex &z) {
     return makedcomplex((double) real(z), (double) imag(z));
   }