From cb15ffc3c756679092da19c96b9f9b4c0e2dc02c Mon Sep 17 00:00:00 2001
From: John Romein <romein@astron.nl>
Date: Fri, 11 Feb 2011 15:58:19 +0000
Subject: [PATCH] bug 225: Added little-endian int->float conversion routines, necessary to bypass PPF bank.

---
 .gitattributes                  |   1 +
 RTCP/CNProc/src/FIR_Asm.S       | 266 ++++++++++++++++++++++++++++++++
 RTCP/CNProc/src/FIR_Asm.h       |   3 +
 RTCP/CNProc/src/PPF.h           |   8 +-
 RTCP/CNProc/test/CMakeLists.txt |   1 +
 RTCP/CNProc/test/tFIR_Asm.cc    |  83 ++++++++++
 6 files changed, 358 insertions(+), 4 deletions(-)
 create mode 100644 RTCP/CNProc/test/tFIR_Asm.cc

diff --git a/.gitattributes b/.gitattributes
index 66b43f80648..f871b594a5a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -2079,6 +2079,7 @@ RTCP/CNProc/test/filterTestResult.ps -text
 RTCP/CNProc/test/inversePPFTestOutput.dat -text
 RTCP/CNProc/test/inversePPFTestResult.ps -text
 RTCP/CNProc/test/tDedispersion.cc -text
+RTCP/CNProc/test/tFIR_Asm.cc -text
 RTCP/CNProc/test/tInversePPF.cc -text
 RTCP/CNProc/test/tPencilBeamFormer.cc -text
 RTCP/CNProc/test/tPencilBeamFormer.sh -text
diff --git a/RTCP/CNProc/src/FIR_Asm.S b/RTCP/CNProc/src/FIR_Asm.S
index 320fc2df4db..a2ac5e52efc 100644
--- a/RTCP/CNProc/src/FIR_Asm.S
+++ b/RTCP/CNProc/src/FIR_Asm.S
@@ -2509,4 +2509,270 @@ _fast_memcpy:
 	lfpdux 14,1,8
 	addi 1,1,16
 	blr
+
+	# void LOFAR::RTCP::_convert(std::complex<float> out[], const std::complex<short> samples[], unsigned count)
+	# little endian i16complex -> float
+	# r3 = out, r4 = samples, r5 = count (assumes the usual 32-bit PowerPC argument registers - confirm)
+.global _ZN5LOFAR4RTCP8_convertISt7complexIsEEEvPS2_IfEPKT_j
+_ZN5LOFAR4RTCP8_convertISt7complexIsEEEvPS2_IfEPKT_j:
+	# Software-pipelined: the loop handles two samples per pass, prologue and epilogue handle four more.
+	stwu 1,-32(1)	# reserve 32 bytes of stack, used below as two 16-byte scratch areas
+
+	lis 8,sub_value@ha	# load sub_values: high part of the address of sub_value (defined outside this hunk)
+	li 9,sub_value@l	# low part of that address, used as the index register
+	lfpdx 0,8,9	# FP pair 0 = the two doubles at sub_value; subtracted from every converted sample
+
+	lis 12,0x0080
+	ori 12,12,0x0080	# r12 = 0x00800080, XOR mask applied to every loaded sample word
+
+	srwi 5,5,1	# r5 = count / 2 (two samples per loop pass)
+	subi 5,5,2	# minus the two passes covered by prologue and epilogue
+	mtctr 5	# loop counter; NOTE(review): looks like count must be even and at least 6 - confirm
+
+	subi 3,3,8	# back out up by one fcomplex: every store below first advances r3 by 8
+
+	li 8,8	# r8 = output stride in bytes
+	li 9,4	# r9 = input stride in bytes (one 32-bit sample)
+	li 10,14	# r10 = offset of the last halfword of the second double in a scratch area
+	li 11,6	# r11 = offset of the last halfword of the first double in a scratch area
+	addi 5,1,16	# r5 = second scratch area; the first one is at r1
+
+	stfpdx 0,0,1	# initialize int->fp conversion area (16 bytes at r1; this also overwrites the word stored by stwu)
+	stfpdx 0,0,5	# same for the second scratch area
+
+	lwz 6,0(4)	# r6 = samples[0], both 16-bit components in one word
+
+	lwzux 7,4,9	# r7 = next sample, r4 advances by 4
+	xor 6,6,12	# flip bit 7 of each halfword, which becomes the top bit after the byte-reversed store
+	sthbrx 6,10,1	# low halfword of r6, byte-reversed, into scratch bytes 14..15
+	srwi 6,6,16	# bring the high halfword down
+	sthbrx 6,11,1	# high halfword, byte-reversed, into scratch bytes 6..7
+
+	lwzux 6,4,9
+	xor 7,7,12	# same steps for the sample in r7, using the second scratch area
+	sthbrx 7,10,5
+	srwi 7,7,16
+	sthbrx 7,11,5
+	lfpdx 1,0,1	# FP pair 1 = the two patched doubles from the first scratch area
+
+	lwzux 7,4,9
+	xor 6,6,12
+	sthbrx 6,10,1
+	srwi 6,6,16
+	sthbrx 6,11,1
+	fpsub 1,1,0	# pair 1 -= sub_value; presumably removes the conversion bias and leaves the sample values - confirm against sub_value
+	lfpdx 2,0,5	# FP pair 2 = the two patched doubles from the second scratch area
+
+0:
+	lwzux 6,4,9	# steady state: load ahead, patch scratch, subtract, store the oldest result
+	xor 7,7,12
+	sthbrx 7,10,5
+	srwi 7,7,16
+	sthbrx 7,11,5
+	fpsub 2,2,0
+	stfpsux 1,3,8	# r3 += 8, then store pair 1 as two single-precision floats (one fcomplex)
+	lfpdx 1,0,1
+
+	lwzux 7,4,9
+	xor 6,6,12
+	sthbrx 6,10,1
+	srwi 6,6,16
+	sthbrx 6,11,1
+	fpsub 1,1,0
+	stfpsux 2,3,8
+	lfpdx 2,0,5
+
+	bdnz 0b	# decrement the counter and loop while it is non-zero
+
+	xor 7,7,12	# epilogue: no further loads, drain the four samples still in flight
+	sthbrx 7,10,5
+	srwi 7,7,16
+	sthbrx 7,11,5
+	fpsub 2,2,0
+	stfpsux 1,3,8
+	lfpdx 1,0,1
+
+	fpsub 1,1,0
+	stfpsux 2,3,8
+	lfpdx 2,0,5
+
+	fpsub 2,2,0
+	stfpsux 1,3,8
+
+	stfpsux 2,3,8	# last output element
+
+	addi 1,1,32	# release the 32-byte scratch frame
+	blr
+
+	# void LOFAR::RTCP::_convert(std::complex<float> out[], const std::complex<signed char> samples[], unsigned count)
+	# i8complex -> float
+	# r3 = out, r4 = samples, r5 = count (assumes the usual 32-bit PowerPC argument registers - confirm)
+.global _ZN5LOFAR4RTCP8_convertISt7complexIaEEEvPS2_IfEPKT_j
+_ZN5LOFAR4RTCP8_convertISt7complexIaEEEvPS2_IfEPKT_j:
+	# Pure table lookup, four samples per loop pass, eight more in prologue and epilogue.
+	lis 11,_ZN5LOFAR4RTCP13_FIR_fp_tableE@ha
+	la 11,_ZN5LOFAR4RTCP13_FIR_fp_tableE@l(11)	# r11 = address of LOFAR::RTCP::_FIR_fp_table
+
+	srwi 5,5,2	# r5 = count / 4 (four samples per loop pass)
+	subi 5,5,2	# minus the two passes covered by prologue and epilogue
+	mtctr 5	# loop counter; NOTE(review): looks like count must be a multiple of 4 and at least 12 - confirm
+
+	subi 3,3,8	# back out up by one fcomplex: every store below first advances r3 by 8
+
+	li 10,8	# r10 = output stride in bytes
+	li 9,2	# r9 = input stride in bytes (one sample is read as a halfword)
+
+	lhz 5,0(4)	# r5..r8 = first four samples, each read as one unsigned 16-bit value
+	lhzux 6,4,9
+	lhzux 7,4,9
+	lhzux 8,4,9
+
+	slwi 5,5,3	# table byte offset = raw sample bits * 8
+	lfpsx 0,11,5	# FP pair 0 = the two single-precision floats at that table entry
+	lhzux 5,4,9	# load ahead: sample for the next round
+
+	slwi 6,6,3
+	lfpsx 1,11,6
+	lhzux 6,4,9
+
+	slwi 7,7,3
+	lfpsx 2,11,7
+	lhzux 7,4,9
+
+	slwi 8,8,3
+	lfpsx 3,11,8
+	lhzux 8,4,9
+
+0:
+	slwi 5,5,3	# steady state: store the previous lookup, look up the current sample, load the next
+	stfpsux 0,3,10	# r3 += 8, then store pair 0 as one fcomplex
+	lfpsx 0,11,5
+	lhzux 5,4,9
+
+	slwi 6,6,3
+	stfpsux 1,3,10
+	lfpsx 1,11,6
+	lhzux 6,4,9
+
+	slwi 7,7,3
+	stfpsux 2,3,10
+	lfpsx 2,11,7
+	lhzux 7,4,9
+
+	slwi 8,8,3
+	stfpsux 3,3,10
+	lfpsx 3,11,8
+	lhzux 8,4,9
+
+	bdnz 0b	# decrement the counter and loop while it is non-zero
+
+	slwi 5,5,3	# epilogue: no further loads, finish the last four lookups
+	stfpsux 0,3,10
+	lfpsx 0,11,5
+
+	slwi 6,6,3
+	stfpsux 1,3,10
+	lfpsx 1,11,6
+
+	slwi 7,7,3
+	stfpsux 2,3,10
+	lfpsx 2,11,7
+
+	slwi 8,8,3
+	stfpsux 3,3,10
+	lfpsx 3,11,8
+
+	stfpsux 0,3,10	# store the final four results
+	stfpsux 1,3,10
+	stfpsux 2,3,10
+	stfpsux 3,3,10
+
+	blr
+
+	# void LOFAR::RTCP::_convert(std::complex<float> out[], const LOFAR::TYPES::i4complex samples[], unsigned count)
+	# i4complex -> float
+	# r3 = out, r4 = samples, r5 = count (assumes the usual 32-bit PowerPC argument registers - confirm)
+.global _ZN5LOFAR4RTCP8_convertINS_5TYPES9i4complexEEEvPSt7complexIfEPKT_j
+_ZN5LOFAR4RTCP8_convertINS_5TYPES9i4complexEEEvPSt7complexIfEPKT_j:
+	# Same table lookup as the i8complex routine, but each sample is read as a single byte.
+	lis 11,_ZN5LOFAR4RTCP13_FIR_fp_tableE@ha
+	la 11,_ZN5LOFAR4RTCP13_FIR_fp_tableE@l(11)	# r11 = address of LOFAR::RTCP::_FIR_fp_table
+
+	srwi 5,5,2	# r5 = count / 4 (four samples per loop pass)
+	subi 5,5,2	# minus the two passes covered by prologue and epilogue
+	mtctr 5	# loop counter; NOTE(review): looks like count must be a multiple of 4 and at least 12 - confirm
+
+	subi 3,3,8	# back out up by one fcomplex: every store below first advances r3 by 8
+
+	li 10,8	# r10 = output stride in bytes
+	li 9,1	# r9 = input stride in bytes (one sample is read as a byte)
+
+	lbz 5,0(4)	# r5..r8 = first four samples, each read as one unsigned byte
+	lbzux 6,4,9
+	lbzux 7,4,9
+	lbzux 8,4,9
+
+	slwi 5,5,3	# table byte offset = raw sample byte * 8
+	lfpsx 0,11,5	# FP pair 0 = the two single-precision floats at that table entry
+	lbzux 5,4,9	# load ahead: sample for the next round
+
+	slwi 6,6,3
+	lfpsx 1,11,6
+	lbzux 6,4,9
+
+	slwi 7,7,3
+	lfpsx 2,11,7
+	lbzux 7,4,9
+
+	slwi 8,8,3
+	lfpsx 3,11,8
+	lbzux 8,4,9
+
+0:
+	slwi 5,5,3	# steady state: store the previous lookup, look up the current sample, load the next
+	stfpsux 0,3,10	# r3 += 8, then store pair 0 as one fcomplex
+	lfpsx 0,11,5
+	lbzux 5,4,9
+
+	slwi 6,6,3
+	stfpsux 1,3,10
+	lfpsx 1,11,6
+	lbzux 6,4,9
+
+	slwi 7,7,3
+	stfpsux 2,3,10
+	lfpsx 2,11,7
+	lbzux 7,4,9
+
+	slwi 8,8,3
+	stfpsux 3,3,10
+	lfpsx 3,11,8
+	lbzux 8,4,9
+
+	bdnz 0b	# decrement the counter and loop while it is non-zero
+
+	slwi 5,5,3	# epilogue: no further loads, finish the last four lookups
+	stfpsux 0,3,10
+	lfpsx 0,11,5
+
+	slwi 6,6,3
+	stfpsux 1,3,10
+	lfpsx 1,11,6
+
+	slwi 7,7,3
+	stfpsux 2,3,10
+	lfpsx 2,11,7
+
+	slwi 8,8,3
+	stfpsux 3,3,10
+	lfpsx 3,11,8
+
+	stfpsux 0,3,10	# store the final four results
+	stfpsux 1,3,10
+	stfpsux 2,3,10
+	stfpsux 3,3,10
+
+	blr
+
+
 #endif
diff --git a/RTCP/CNProc/src/FIR_Asm.h b/RTCP/CNProc/src/FIR_Asm.h
index 947bfd57973..1d85d51da62 100644
--- a/RTCP/CNProc/src/FIR_Asm.h
+++ b/RTCP/CNProc/src/FIR_Asm.h
@@ -22,6 +22,7 @@
 #define LOFAR_CNPROC_FIR_ASM_H
 
 #if defined HAVE_BGP
+#include <Common/lofar_complex.h>
 #include <Interface/Config.h>
 
 namespace LOFAR {
@@ -38,6 +39,8 @@ template <typename SAMPLE_TYPE> extern void _filter(unsigned nrChannels,
 					  fcomplex out[],
 					  int nr_samples_div_16);
 
+template <typename SAMPLE_TYPE> extern void _convert(fcomplex out[], const SAMPLE_TYPE samples[], unsigned count); // declaration only; FIR_Asm.S defines the i16complex, i8complex and i4complex instances
+
 extern "C" {
   void _transpose_4x8(fcomplex *out,
 		      const fcomplex *in,
diff --git a/RTCP/CNProc/src/PPF.h b/RTCP/CNProc/src/PPF.h
index d4f8769e95b..7db2e8436f8 100644
--- a/RTCP/CNProc/src/PPF.h
+++ b/RTCP/CNProc/src/PPF.h
@@ -38,13 +38,13 @@ template <typename SAMPLE_TYPE> class PPF: boost::noncopyable
     void computeFlags(unsigned stat, const SubbandMetaData *metaData, FilteredData *);
     void filter(unsigned stat, double centerFrequency, const SubbandMetaData *metaData, const TransposedData<SAMPLE_TYPE> *, FilteredData *);
 
-  private:
-    void init_fft(), destroy_fft();
-
 #if !defined PPF_C_IMPLEMENTATION
-    void initConstantTable();
+    static void initConstantTable(); // now static and declared above the private: label; tFIR_Asm.cc calls it without a PPF object
 #endif
 
+  private:
+    void init_fft(), destroy_fft();
+
 #if defined PPF_C_IMPLEMENTATION
     fcomplex phaseShift(unsigned time, unsigned chan, double baseFrequency, double delayAtBegin, double delayAfterEnd) const;
 #else
diff --git a/RTCP/CNProc/test/CMakeLists.txt b/RTCP/CNProc/test/CMakeLists.txt
index 5e0f7013410..c0dcf82b448 100644
--- a/RTCP/CNProc/test/CMakeLists.txt
+++ b/RTCP/CNProc/test/CMakeLists.txt
@@ -7,6 +7,7 @@ include_directories(${PACKAGE_SOURCE_DIR}/src)
 lofar_add_test(tCN_Processing tCN_Processing.cc)
 lofar_add_test(tBeamForming tBeamForming.cc)
 lofar_add_test(tDedispersion tDedispersion.cc)
+lofar_add_test(tFIR_Asm tFIR_Asm.cc)
 lofar_add_test(tPencilBeamFormer tPencilBeamFormer.cc)
 lofar_add_test(tStokes tStokes.cc)
 lofar_add_test(tInversePPF tInversePPF.cc)
diff --git a/RTCP/CNProc/test/tFIR_Asm.cc b/RTCP/CNProc/test/tFIR_Asm.cc
new file mode 100644
index 00000000000..a9fbc2eda8e
--- /dev/null
+++ b/RTCP/CNProc/test/tFIR_Asm.cc
@@ -0,0 +1,83 @@
+#include <lofar_config.h>
+
+#include <FIR_Asm.h>
+#include <PPF.h>
+#include <Common/Timer.h>
+
+#include <iostream>
+
+
+#if defined HAVE_BGP
+using namespace LOFAR;
+using namespace LOFAR::RTCP;
+using namespace LOFAR::TYPES;
+#endif
+
+#define SIZE 131072 // number of complex samples converted in each of the three runs below
+
+int main() // times each _convert variant once and prints a few results; nothing is checked automatically
+{
+#if defined HAVE_BGP
+  { // run 1: 16-bit complex samples
+    i16complex in[SIZE] = { // only the first three elements are given; the rest are value-initialized
+      makei16complex(0x0100, 0x0200),
+      makei16complex(0x0300, 0x0400),
+      makei16complex(0x0500, 0x0600),
+    };
+
+    in[SIZE - 1] = makei16complex(0x0700, 0x0801); // distinct last sample so the end of the conversion shows up in the output
+
+    fcomplex out[SIZE]; // NOTE(review): SIZE * sizeof(fcomplex) bytes of automatic storage - confirm this fits the stack on the target
+
+    NSTimer timer("little endian i16complex -> float", true);
+    timer.start();
+    _convert(out, in, SIZE); // only the conversion call itself is timed
+    timer.stop();
+
+    std::cout << out[0] << ' ' << out[1] << ' ' << out[2] << ' ' << out[SIZE - 1] << std::endl; // first three and last result, for inspection
+  }
+
+  { // run 2: 8-bit complex samples
+    PPF<i8complex>::initConstantTable(); // presumably fills the lookup table that the i8complex routine in FIR_Asm.S reads - confirm
+
+    i8complex in[SIZE] = { // only the first three elements are given; the rest are value-initialized
+      makei8complex(1, 2),
+      makei8complex(3, 4),
+      makei8complex(5, 6),
+    };
+
+    in[SIZE - 1] = makei8complex(7, 8); // distinct last sample so the end of the conversion shows up in the output
+
+    fcomplex out[SIZE]; // NOTE(review): large automatic array, see run 1
+
+    NSTimer timer("little endian i8complex -> float", true); // NOTE(review): label says little endian, the assembly header for this routine does not - confirm
+    timer.start();
+    _convert(out, in, SIZE); // only the conversion call itself is timed
+    timer.stop();
+
+    std::cout << out[0] << ' ' << out[1] << ' ' << out[2] << ' ' << out[SIZE - 1] << std::endl; // first three and last result, for inspection
+  }
+
+  { // run 3: 4-bit complex samples
+    PPF<i4complex>::initConstantTable(); // presumably fills the lookup table that the i4complex routine in FIR_Asm.S reads - confirm
+
+    i4complex in[SIZE] = { // only the first three elements are given; the rest are value-initialized
+      makei4complex(0.5, 1.5),
+      makei4complex(2.5, 3.5),
+      makei4complex(4.5, 5.5),
+    };
+
+    in[SIZE - 1] = makei4complex(-1.5, -0.5); // distinct last sample, here with negative components
+
+    fcomplex out[SIZE]; // NOTE(review): large automatic array, see run 1
+
+    NSTimer timer("little endian i4complex -> float", true); // NOTE(review): label says little endian, the assembly header for this routine does not - confirm
+    timer.start();
+    _convert(out, in, SIZE); // only the conversion call itself is timed
+    timer.stop();
+
+    std::cout << out[0] << ' ' << out[1] << ' ' << out[2] << ' ' << out[SIZE - 1] << std::endl; // first three and last result, for inspection
+  }
+#endif
+  return 0;
+}
-- 
GitLab