diff --git a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt index e004746f046157c22a0c677b50aec3d0549a439d..cbd24ffa5edd3f502c5840525e726117043e3ce8 100644 --- a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt +++ b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt @@ -22,85 +22,96 @@ Pipeline For max size, we assume: - 48 stations - 1 subband - - 1 second blocks (195312.5 samples) rounded to next multiple of 4096 + - 1 second blocks (195312.5 samples) rounded to next multiple of 4096 (= 196608 samples). Note: - MiB = 2^20 bytes (1048576) - (*) = requires change from current implementation + MiB = 2^20 bytes (= 1048576 bytes). Flow: Data dimensions: Max size (fcomplex): Buffer: =================================================================================================================== -(input) [station][samples][pol] [48][196608][2] = 72 MiB A +(input) [station][sample][pol] [48][196608][2] = 72 MiB A | (i16complex) V IntToFloat + Transpose - | [station][pol][samples] [48][2][196608] = 144 MiB B + | [station][pol][sample] [48][2][196608] = 144 MiB B + V +FFT-shift {inplace} + | [station][pol][sample] [48][2][196608] = 144 MiB B V FFT-64 {inplace} - | [station][pol][samples][channel] [48][2][3072][64] = 144 MiB B + | [station][pol][sample][channel] [48][2][3072][64] = 144 MiB B + V +Delay compensation + Transpose {I/O: delays} + | [station][pol][channel][sample] [48][2][64][3072] = 144 MiB A V -Delay compensation (*: no transpose) {I/O: delays} - | [station][pol][channel][samples] [48][2][64][3072] = 144 MiB A +FFT-shift {inplace} + | [station][pol][channel][sample] [48][2][64][3072] = 144 MiB A V FFT-64 {inplace} - | [station][pol][chan1][samples][chan2] [48][2][64][48][64] = 144 MiB A + | [station][pol][chan1][sample][chan2] [48][2][64][48][64] = 144 MiB A V BandPass + Transpose {I/O: weights} - | [station][chan1][chan2][samples][pol] [48][64][64][48][2] = 144 MiB B - V = [stations][channel][samples][pol] + | 
[station][chan1][chan2][sample][pol] [48][64][64][48][2] = 144 MiB B + V = [station][channel][sample][pol] X Complex Voltages/Coherent Stokes: ----------------------------------- - X [station][channel][samples][pol] [48][4096][48][2] = 144 MiB B + X [station][channel][sample][pol] [48][4096][48][2] = 144 MiB B | V BeamFormer {I/O: weights} - | [channel][samples][tab][pol] [4096][48][tab][2] = 3 MiB/TAB A + | [channel][sample][tab][pol] [4096][48][tab][2] = 3 MiB/TAB A V Transpose - | [tab][pol][samples][channel] [tab][2][48][4096] = 3 MiB/TAB 1ch: CS: C, CV: D + | [tab][pol][sample][channel] [tab][2][48][4096] = 3 MiB/TAB 1ch: CS: C, CV: D | Nch: CS: D, CV: C V iFFT-4k {inplace} - | [tab][pol][samples] [tab][2][196608] = 3 MiB/TAB 1ch: CS: C, CV: D + | [tab][pol][sample] [tab][2][196608] = 3 MiB/TAB 1ch: CS: C, CV: D + | Nch: CS: D, CV: C + V +FFT-shift {inplace} + | [tab][pol][sample] [tab][2][196608] = 3 MiB/TAB 1ch: CS: C, CV: D | Nch: CS: D, CV: C V FIR-16 (if >1ch) - | [tab][pol][samples] [tab][2][196608] = 3 MiB/TAB 1ch: CS: -, CV: - + | [tab][pol][sample] [tab][2][196608] = 3 MiB/TAB 1ch: CS: -, CV: - | Nch: CS: C, CV: D V FFT-16 {inplace} (if >1ch) - | [tab][pol][samples][channel] [tab][2][12288][16] = 3 MiB/TAB 1ch: CS: -, CV: - + | [tab][pol][sample][channel] [tab][2][12288][16] = 3 MiB/TAB 1ch: CS: -, CV: - | Nch: CS: C, CV: D V -Coherent Stokes (*: no transpose) - | [tab][stokes][samples][channel] [tab][4][12288][16] = 0.75 MiB/TAB/Stokes 1ch: CS: D, CV: - +Coherent Stokes + | [tab][stokes][sample][channel] [tab][4][12288][16] = 0.75 MiB/TAB/Stokes 1ch: CS: D, CV: - | (float) Nch: CS: D, CV: - V (output) Incoherent Stokes: ----------------------------------- - X [station][channel][samples][pol] [48][4096][48][2] = 144 MiB B + X [station][channel][sample][pol] [48][4096][48][2] = 144 MiB B | V Transpose + Copy - | [station][pol][samples][channel] [48][2][48][4096] = 144 MiB A + | [station][pol][sample][channel] [48][2][48][4096] = 144 MiB A V iFFT-4k 
{inplace} - | [station][pol][samples] [48][2][196608] = 144 MiB A + | [station][pol][sample] [48][2][196608] = 144 MiB A + V +FFT-shift {inplace} + | [station][pol][sample] [48][2][196608] = 144 MiB A V FIR-16 (if >1ch) - | [station][pol][samples] [48][2][196608] = 144 MiB B + | [station][pol][sample] [48][2][196608] = 144 MiB B | V FFT-16 {inplace} (if >1ch) - | [station][pol][samples][channel] [48][2][12288][16] = 144 MiB B - | + | [station][pol][sample][channel] [48][2][12288][16] = 144 MiB B V -Incoherent Stokes (*: no transpose) - | [stokes][samples][channel] [4][12288][16] = 3 MiB E +Incoherent Stokes + | [stokes][sample][channel] [4][12288][16] = 3 MiB E V (float) (output) diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc index 78601b3b78abb6ea8f02af3e24db5fd9cfc20c00..f33b69542d2b811aa67402b17df6704f784d91f6 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc @@ -34,7 +34,7 @@ namespace LOFAR delayCompensation(delayCompensationParams(ps)), beamFormer(beamFormerParams(ps)), coherentTranspose(coherentTransposeParams(ps)), - fftShiftKernel(FFTShiftKernelParams(ps)), + fftShift(fftShiftParams(ps)), firFilter(firFilterParams(ps, nrSubbandsPerSubbandProc)), coherentStokes(coherentStokesParams(ps)), incoherentStokes(incoherentStokesParams(ps)), @@ -129,7 +129,7 @@ namespace LOFAR } FFTShiftKernel::Parameters - BeamFormerFactories::FFTShiftKernelParams(const Parset &ps) const + BeamFormerFactories::fftShiftParams(const Parset &ps) const { FFTShiftKernel::Parameters params(ps); // Currently a static in the subband proc diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h index 9cbc56f4ce3ba9758a837ee867fcf8c9214352e5..fe2400634f55219c62b19421a4b838f8f525c9ad 100644 --- 
a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h @@ -50,7 +50,7 @@ namespace LOFAR KernelFactory<DelayAndBandPassKernel> delayCompensation; KernelFactory<BeamFormerKernel> beamFormer; KernelFactory<CoherentStokesTransposeKernel> coherentTranspose; - KernelFactory<FFTShiftKernel> fftShiftKernel; + KernelFactory<FFTShiftKernel> fftShift; KernelFactory<FIR_FilterKernel> firFilter; KernelFactory<CoherentStokesKernel> coherentStokes; KernelFactory<IncoherentStokesKernel> incoherentStokes; @@ -74,7 +74,7 @@ namespace LOFAR delayCompensationParams(const Parset &ps) const; FFTShiftKernel::Parameters - FFTShiftKernelParams(const Parset &ps) const; + fftShiftParams(const Parset &ps) const; FIR_FilterKernel::Parameters firFilterParams(const Parset &ps, size_t nrSubbandsPerSubbandProc) const; diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc index 8738668893b08229f54e6b3fc5644393c70a39ae..ec57b6b3e6ced765564b4f3a33d435bbc684321f 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc @@ -98,6 +98,11 @@ namespace LOFAR intToFloatBuffers(devInput.inputSamples, devB), intToFloatKernel(factories.intToFloat.create(queue, intToFloatBuffers)), + // FFTShift: B -> B + firstFFTShiftBuffers(devB, devB), + firstFFTShiftKernel( + factories.fftShift.create(queue, firstFFTShiftBuffers)), + // FFT: B -> B firstFFT(queue, ps.settings.beamFormer.nrDelayCompensationChannels, @@ -112,6 +117,11 @@ namespace LOFAR delayCompensationKernel( factories.delayCompensation.create(queue, delayCompensationBuffers)), + // FFTShift: A -> A + secondFFTShiftBuffers(devA, devA), + secondFFTShiftKernel( + factories.fftShift.create(queue, secondFFTShiftBuffers)), + // FFT: A -> A secondFFT(queue, 
ps.settings.beamFormer.nrHighResolutionChannels / @@ -167,6 +177,12 @@ namespace LOFAR ps.settings.beamFormer.nrHighResolutionChannels), false, coherentTransposeBuffers.output), + // FFTShift: C/D -> C/D (in-place) = coherentTransposeBuffers.output + inverseFFTShiftBuffers( + coherentTransposeBuffers.output, coherentTransposeBuffers.output), + inverseFFTShiftKernel( + factories.fftShift.create(queue, inverseFFTShiftBuffers)), + // FIR filter: D/C -> C/D // // Input buffer: @@ -225,6 +241,11 @@ namespace LOFAR ps.settings.beamFormer.nrHighResolutionChannels), false, devA), + // inverse FFTShift: A -> A + incoherentInverseFFTShiftBuffers(devA, devA), + incoherentInverseFFTShiftKernel( + factories.fftShift.create(queue, incoherentInverseFFTShiftBuffers)), + // FIR filter: A -> B devIncoherentFilterWeights( context, @@ -335,17 +356,21 @@ namespace LOFAR BeamFormerSubbandProc::Counters::Counters(gpu::Context &context) : intToFloat(context), + firstFFTShift(context), firstFFT(context), delayBp(context), + secondFFTShift(context), secondFFT(context), correctBandpass(context), beamformer(context), transpose(context), inverseFFT(context), + inverseFFTShift(context), firFilterKernel(context), finalFFT(context), coherentStokes(context), incoherentInverseFFT(context), + incoherentInverseFFTShift(context), incoherentFirFilterKernel(context), incoherentFinalFFT(context), incoherentStokes(context), @@ -362,13 +387,16 @@ namespace LOFAR LOG_INFO_STR( "**** BeamFormerSubbandProc GPU mean and stDev ****" << endl << std::setw(20) << "(intToFloat)" << intToFloat.stats << endl << + std::setw(20) << "(firstFFTShift)" << firstFFTShift.stats << endl << std::setw(20) << "(firstFFT)" << firstFFT.stats << endl << std::setw(20) << "(delayBp)" << delayBp.stats << endl << + std::setw(20) << "(secondFFTShift)" << secondFFTShift.stats << endl << std::setw(20) << "(secondFFT)" << secondFFT.stats << endl << std::setw(20) << "(correctBandpass)" << correctBandpass.stats << endl << std::setw(20) <<
"(beamformer)" << beamformer.stats << endl << std::setw(20) << "(transpose)" << transpose.stats << endl << std::setw(20) << "(inverseFFT)" << inverseFFT.stats << endl << + std::setw(20) << "(inverseFFTShift)" << inverseFFTShift.stats << endl << std::setw(20) << "(firFilterKernel)" << firFilterKernel.stats << endl << std::setw(20) << "(finalFFT)" << finalFFT.stats << endl << std::setw(20) << "(coherentStokes)" << coherentStokes.stats << endl << @@ -376,6 +404,7 @@ namespace LOFAR std::setw(20) << "(visibilities)" << visibilities.stats << endl << std::setw(20) << "(incoherentOutput )" << incoherentOutput.stats << endl << std::setw(20) << "(incoherentInverseFFT)" << incoherentInverseFFT.stats << endl << + std::setw(20) << "(incoherentInverseFFTShift)" << incoherentInverseFFTShift.stats << endl << std::setw(20) << "(incoherentFirFilterKernel)" << incoherentFirFilterKernel.stats << endl << std::setw(20) << "(incoherentFinalFFT)" << incoherentFinalFFT.stats << endl << std::setw(20) << "(incoherentStokes)" << incoherentStokes.stats << endl << @@ -419,6 +448,7 @@ namespace LOFAR // Otherwise, a kernel arg may not be set... 
intToFloatKernel->enqueue(input.blockID, counters.intToFloat); + firstFFTShiftKernel->enqueue(input.blockID, counters.firstFFTShift); firstFFT.enqueue(input.blockID, counters.firstFFT); delayCompensationKernel->enqueue( @@ -426,6 +456,7 @@ namespace LOFAR ps.settings.subbands[subband].centralFrequency, ps.settings.subbands[subband].SAP); + secondFFTShiftKernel->enqueue(input.blockID, counters.secondFFTShift); secondFFT.enqueue(input.blockID, counters.secondFFT); bandPassCorrectionKernel->enqueue( @@ -440,7 +471,9 @@ namespace LOFAR ps.settings.subbands[subband].SAP); coherentTransposeKernel->enqueue(input.blockID, counters.transpose); + inverseFFT.enqueue(input.blockID, counters.inverseFFT); + inverseFFTShiftKernel->enqueue(input.blockID, counters.inverseFFTShift); if (coherentStokesPPF) { @@ -464,6 +497,8 @@ namespace LOFAR incoherentInverseFFT.enqueue( input.blockID, counters.incoherentInverseFFT); + incoherentInverseFFTShiftKernel->enqueue( + input.blockID, counters.incoherentInverseFFTShift); if (incoherentStokesPPF) { diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h index 9e7dc2562a28897ac83e9a73e17f326ada61abab..c946e76b1004d899a63f6913aeef21ea82f500a4 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h @@ -37,6 +37,7 @@ #include <GPUProc/Kernels/CoherentStokesTransposeKernel.h> #include <GPUProc/Kernels/CoherentStokesKernel.h> #include <GPUProc/Kernels/DelayAndBandPassKernel.h> +#include <GPUProc/Kernels/FFTShiftKernel.h> #include <GPUProc/Kernels/FFT_Kernel.h> #include <GPUProc/Kernels/FIR_FilterKernel.h> #include <GPUProc/Kernels/IncoherentStokesKernel.h> @@ -90,18 +91,22 @@ namespace LOFAR // gpu kernel counters PerformanceCounter intToFloat; + PerformanceCounter firstFFTShift; PerformanceCounter firstFFT; PerformanceCounter delayBp; + PerformanceCounter 
secondFFTShift; PerformanceCounter secondFFT; PerformanceCounter correctBandpass; PerformanceCounter beamformer; PerformanceCounter transpose; PerformanceCounter inverseFFT; + PerformanceCounter inverseFFTShift; PerformanceCounter firFilterKernel; PerformanceCounter finalFFT; PerformanceCounter coherentStokes; PerformanceCounter incoherentInverseFFT; + PerformanceCounter incoherentInverseFFTShift; PerformanceCounter incoherentFirFilterKernel; PerformanceCounter incoherentFinalFFT; PerformanceCounter incoherentStokes; @@ -149,6 +154,10 @@ namespace LOFAR IntToFloatKernel::Buffers intToFloatBuffers; std::auto_ptr<IntToFloatKernel> intToFloatKernel; + // First FFT-shift + FFTShiftKernel::Buffers firstFFTShiftBuffers; + std::auto_ptr<FFTShiftKernel> firstFFTShiftKernel; + // First (64 points) FFT FFT_Kernel firstFFT; @@ -156,6 +165,10 @@ namespace LOFAR DelayAndBandPassKernel::Buffers delayCompensationBuffers; std::auto_ptr<DelayAndBandPassKernel> delayCompensationKernel; + // Second FFT-shift + FFTShiftKernel::Buffers secondFFTShiftBuffers; + std::auto_ptr<FFTShiftKernel> secondFFTShiftKernel; + // Second (64 points) FFT FFT_Kernel secondFFT; @@ -182,6 +195,10 @@ namespace LOFAR // inverse (4k points) FFT FFT_Kernel inverseFFT; + // inverse FFT-shift + FFTShiftKernel::Buffers inverseFFTShiftBuffers; + std::auto_ptr<FFTShiftKernel> inverseFFTShiftKernel; + // Poly-phase filter (FIR + FFT) gpu::DeviceMemory devFilterWeights; gpu::DeviceMemory devFilterHistoryData; @@ -205,6 +222,10 @@ namespace LOFAR // Inverse (4k points) FFT FFT_Kernel incoherentInverseFFT; + // Inverse FFT-shift + FFTShiftKernel::Buffers incoherentInverseFFTShiftBuffers; + std::auto_ptr<FFTShiftKernel> incoherentInverseFFTShiftKernel; + // Poly-phase filter (FIR + FFT) gpu::DeviceMemory devIncoherentFilterWeights; gpu::DeviceMemory devIncoherentFilterHistoryData; diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc 
b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc index a01887b3da6827c31b0fbdb981e9957af99d0197..3b75c8ea5530d0aa09c9f989143431ca562f89ec 100644 --- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc +++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc @@ -23,6 +23,7 @@ #include <complex> #include <cmath> +#include <iomanip> #include <Common/LofarLogger.h> #include <CoInterface/Parset.h> @@ -39,16 +40,20 @@ using namespace LOFAR::TYPES; template<typename T> T inputSignal(size_t t) { size_t nrBits = sizeof(T) / 2 * 8; - double freq = 1.0 / 4.0; // in samples - // double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples double amp = (1 << (nrBits - 1)) - 1; - +#if 1 // Toggle to experiment with pulse-like input + // Sine wave + // double freq = 1.0 / 4.0; // in samples + double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples double angle = (double)t * 2.0 * M_PI * freq; - double s = ::sin(angle); double c = ::cos(angle); - return T(::round(amp * c), ::round(amp * s)); +#else + // Pulse train + if (t % (2 * 64 + 17) == 0) return T(amp); + else return T(0); +#endif } int main() { @@ -152,7 +157,7 @@ int main() { // Block number: 0 ..
inf in.blockID.block = 0; - // Subband index in the observation: [0, ps.nrSubbands()) + // Subband index in the observation: [0, ps.nrSubbands()) in.blockID.globalSubbandIdx = 0; // Subband index for this pipeline/workqueue: [0, subbandIndices.size()) @@ -206,9 +211,10 @@ int main() { for (size_t s = 0; s < nrStokes; s++) for (size_t t = 0; t < nrSamples; t++) for (size_t c = 0; c < nrChannels; c++) - ASSERTSTR(fpEquals(out[0][s][t][c], outVal), + ASSERTSTR(fpEquals(out[0][s][t][c], outVal, 1e-4f), "out[" << s << "][" << t << "][" << c << "] = " << - out[0][s][t][c] << "; outVal = " << outVal); + setprecision(12) << out[0][s][t][c] << + "; outVal = " << outVal); return 0; } diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc index eff715074769797ed01c8da923f7f235ba9fd3ab..19b703c87012bf0588faf5d5e7e2c7f370d3dceb 100644 --- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc +++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc @@ -40,16 +40,20 @@ using namespace LOFAR::TYPES; template<typename T> T inputSignal(size_t t) { size_t nrBits = sizeof(T) / 2 * 8; - double freq = 1.0 / 4.0; // in samples - // double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples double amp = (1 << (nrBits - 1)) - 1; - +#if 1 // Toggle to experiment with pulse type input + // Sine wave + // double freq = 1.0 / 4.0; // in samples + double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples double angle = (double)t * 2.0 * M_PI * freq; - double s = ::sin(angle); double c = ::cos(angle); - return T(::round(amp * c), ::round(amp * s)); +#else + // Pulse train + if (t % (2 * 64 + 17) == 0) return T(amp); + else return T(0); +#endif } int main() { @@ -219,7 +223,7 @@ int main() { for (size_t t = 0; t < nrSamples; t++) for (size_t c = 0; c < nrChannels; c++) { - 
ASSERTSTR(fpEquals(out[tab][s][t][c], outVal), + ASSERTSTR(fpEquals(out[tab][s][t][c], outVal, 1e-4f), "out[" << tab << "][" << s << "][" << t << "][" << c << "] = " << setprecision(12) << out[tab][s][t][c] << "; outVal = " << outVal); }