Task #5287: Merge with trunk (Add fftshift kernel to BeamFormer pipeline)...

Task #5287: Merge with trunk (Add fftshift kernel to BeamFormer pipeline). Updated tests for the new output. Small naming refactoring. Reviewed by Wouter.

Task #5287: Merge with trunk (Add fftshift kernel to BeamFormer pipeline)...
1475c354 · Wouter Klijn · 9b372438 · e977b8ab · 1475c354 · 1475c354
Commit 1475c354 authored 11 years ago by Wouter Klijn
--- a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt
+++ b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt
@@ -22,85 +22,96 @@ Pipeline
 For max size, we assume:
 - 48 stations
 - 1 subband
- - 1 second blocks (195312.5 samples) rounded to next multiple of 4096
+ - 1 second blocks (195312.5 samples) rounded to next multiple of 4096 (= 196608 samples).
 Note:
-  MiB = 2^20 bytes (1048576)
+  MiB = 2^20 bytes (= 1048576 bytes).
-  (*) = requires change from current implementation
 Flow:           Data dimensions:                        Max size (fcomplex):                        Buffer:
 ===================================================================================================================
-(input)         [station][samples][pol]                 [48][196608][2]     =  72 MiB               A
+(input)         [station][sample][pol]                  [48][196608][2]     =  72 MiB               A
   |                                                    (i16complex)
   V
 IntToFloat + Transpose
-   |            [station][pol][samples]                 [48][2][196608]     = 144 MiB               B
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               B
+   V
+FFT-shift {inplace}
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               B
   V
 FFT-64 {inplace}
-   |            [station][pol][samples][channel]        [48][2][3072][64]   = 144 MiB               B
+   |            [station][pol][sample][channel]         [48][2][3072][64]   = 144 MiB               B
+   V
+Delay compensation + Transpose {I/O: delays}
+   |            [station][pol][channel][sample]         [48][2][64][3072]   = 144 MiB               A
   V
-Delay compensation (*: no transpose) {I/O: delays}
+FFT-shift {inplace}
-   |            [station][pol][channel][samples]        [48][2][64][3072]   = 144 MiB               A
+   |            [station][pol][channel][sample]         [48][2][64][3072]   = 144 MiB               A
   V
 FFT-64 {inplace}
-   |            [station][pol][chan1][samples][chan2]   [48][2][64][48][64] = 144 MiB               A
+   |            [station][pol][chan1][sample][chan2]    [48][2][64][48][64] = 144 MiB               A
   V
 BandPass + Transpose {I/O: weights}
-   |            [station][chan1][chan2][samples][pol]   [48][64][64][48][2] = 144 MiB               B
+   |            [station][chan1][chan2][sample][pol]    [48][64][64][48][2] = 144 MiB               B
-   V          = [stations][channel][samples][pol]
+   V          = [station][channel][sample][pol]
   X
 Complex Voltages/Coherent Stokes:
 -----------------------------------
-   X            [station][channel][samples][pol]        [48][4096][48][2]   = 144 MiB               B
+   X            [station][channel][sample][pol]         [48][4096][48][2]   = 144 MiB               B
   |
   V
 BeamFormer {I/O: weights}
-   |            [channel][samples][tab][pol]            [4096][48][tab][2]  = 3 MiB/TAB             A
+   |            [channel][sample][tab][pol]             [4096][48][tab][2]  = 3 MiB/TAB             A
   V
 Transpose  
-   |            [tab][pol][samples][channel]            [tab][2][48][4096]  = 3 MiB/TAB             1ch: CS: C, CV: D
+   |            [tab][pol][sample][channel]             [tab][2][48][4096]  = 3 MiB/TAB             1ch: CS: C, CV: D
   |                                                                                                Nch: CS: D, CV: C
   V
 iFFT-4k {inplace}
-   |            [tab][pol][samples]                     [tab][2][196608]    = 3 MiB/TAB             1ch: CS: C, CV: D
+   |            [tab][pol][sample]                      [tab][2][196608]    = 3 MiB/TAB             1ch: CS: C, CV: D
+   |                                                                                                Nch: CS: D, CV: C
+   V
+FFT-shift {inplace}
+   |            [tab][pol][sample]                      [tab][2][196608]    = 3 MiB/TAB             1ch: CS: C, CV: D
   |                                                                                                Nch: CS: D, CV: C
   V
 FIR-16 (if >1ch)
-   |            [tab][pol][samples]                     [tab][2][196608]    = 3 MiB/TAB             1ch: CS: -, CV: -
+   |            [tab][pol][sample]                      [tab][2][196608]    = 3 MiB/TAB             1ch: CS: -, CV: -
   |                                                                                                Nch: CS: C, CV: D
   V
 FFT-16 {inplace} (if >1ch)
-   |            [tab][pol][samples][channel]            [tab][2][12288][16] = 3 MiB/TAB             1ch: CS: -, CV: -
+   |            [tab][pol][sample][channel]             [tab][2][12288][16] = 3 MiB/TAB             1ch: CS: -, CV: -
   |                                                                                                Nch: CS: C, CV: D
   V
-Coherent Stokes (*: no transpose)
+Coherent Stokes
-   |            [tab][stokes][samples][channel]         [tab][4][12288][16] = 0.75 MiB/TAB/Stokes   1ch: CS: D, CV: -
+   |            [tab][stokes][sample][channel]          [tab][4][12288][16] = 0.75 MiB/TAB/Stokes   1ch: CS: D, CV: -
   |                                                    (float)                                     Nch: CS: D, CV: -
   V
 (output)
 Incoherent Stokes:
 -----------------------------------
-   X            [station][channel][samples][pol]        [48][4096][48][2]   = 144 MiB               B
+   X            [station][channel][sample][pol]         [48][4096][48][2]   = 144 MiB               B
   |
   V
 Transpose + Copy
-   |            [station][pol][samples][channel]        [48][2][48][4096]   = 144 MiB               A
+   |            [station][pol][sample][channel]         [48][2][48][4096]   = 144 MiB               A
   V
 iFFT-4k {inplace}
-   |            [station][pol][samples]                 [48][2][196608]     = 144 MiB               A
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               A
+   V
+FFT-shift {inplace}
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               A
   V
 FIR-16 (if >1ch) 
-   |            [station][pol][samples]                 [48][2][196608]     = 144 MiB               B
+   |            [station][pol][sample]                  [48][2][196608]     = 144 MiB               B
   |
   V
 FFT-16 {inplace} (if >1ch)
-   |            [station][pol][samples][channel]        [48][2][12288][16]  = 144 MiB               B
+   |            [station][pol][sample][channel]         [48][2][12288][16]  = 144 MiB               B
-   |
   V
-Incoherent Stokes (*: no transpose)
+Incoherent Stokes
-   |            [stokes][samples][channel]              [4][12288][16]      = 3 MiB                 E
+   |            [stokes][sample][channel]               [4][12288][16]      = 3 MiB                 E
   V                                                    (float)
 (output)

--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc
@@ -34,7 +34,7 @@ namespace LOFAR
        delayCompensation(delayCompensationParams(ps)),
        beamFormer(beamFormerParams(ps)),
        coherentTranspose(coherentTransposeParams(ps)),
-        fftShiftKernel(FFTShiftKernelParams(ps)),
+        fftShift(fftShiftParams(ps)),
        firFilter(firFilterParams(ps, nrSubbandsPerSubbandProc)),
        coherentStokes(coherentStokesParams(ps)),
        incoherentStokes(incoherentStokesParams(ps)),
@@ -129,7 +129,7 @@ namespace LOFAR
      }
      FFTShiftKernel::Parameters
-      BeamFormerFactories::FFTShiftKernelParams(const Parset &ps) const
+      BeamFormerFactories::fftShiftParams(const Parset &ps) const
      {
        FFTShiftKernel::Parameters params(ps);
        // Currently a static in the subband proc

--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h
@@ -50,7 +50,7 @@ namespace LOFAR
      KernelFactory<DelayAndBandPassKernel> delayCompensation;
      KernelFactory<BeamFormerKernel> beamFormer;
      KernelFactory<CoherentStokesTransposeKernel> coherentTranspose;
-      KernelFactory<FFTShiftKernel> fftShiftKernel;
+      KernelFactory<FFTShiftKernel> fftShift;
      KernelFactory<FIR_FilterKernel> firFilter;
      KernelFactory<CoherentStokesKernel> coherentStokes;
      KernelFactory<IncoherentStokesKernel> incoherentStokes;
@@ -74,7 +74,7 @@ namespace LOFAR
      delayCompensationParams(const Parset &ps) const;
      FFTShiftKernel::Parameters
-        FFTShiftKernelParams(const Parset &ps) const;
+      fftShiftParams(const Parset &ps) const;
      FIR_FilterKernel::Parameters
      firFilterParams(const Parset &ps, size_t nrSubbandsPerSubbandProc) const;

--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc
@@ -98,6 +98,11 @@ namespace LOFAR
      intToFloatBuffers(devInput.inputSamples, devB),
      intToFloatKernel(factories.intToFloat.create(queue, intToFloatBuffers)),
+      // FFTShift: B -> B
+      firstFFTShiftBuffers(devB, devB),
+      firstFFTShiftKernel(
+        factories.fftShift.create(queue, firstFFTShiftBuffers)),
      // FFT: B -> B
      firstFFT(queue,
        ps.settings.beamFormer.nrDelayCompensationChannels,
@@ -112,6 +117,11 @@ namespace LOFAR
      delayCompensationKernel(
        factories.delayCompensation.create(queue, delayCompensationBuffers)),
+      // FFTShift: A -> A
+      secondFFTShiftBuffers(devA, devA),
+      secondFFTShiftKernel(
+        factories.fftShift.create(queue, secondFFTShiftBuffers)),
      // FFT: A -> A
      secondFFT(queue,
        ps.settings.beamFormer.nrHighResolutionChannels /
@@ -167,6 +177,12 @@ namespace LOFAR
         ps.settings.beamFormer.nrHighResolutionChannels),
         false, coherentTransposeBuffers.output),
+      // fftshift: C/D -> C/D (in-place) = coherentTransposeBuffers.output
+      inverseFFTShiftBuffers(
+        coherentTransposeBuffers.output, coherentTransposeBuffers.output),
+      inverseFFTShiftKernel(
+        factories.fftShift.create(queue, inverseFFTShiftBuffers)),
      // FIR filter: D/C -> C/D
      //
      // Input buffer:
@@ -225,6 +241,11 @@ namespace LOFAR
         ps.settings.beamFormer.nrHighResolutionChannels),
        false, devA),
+      // inverse FFTShift: A -> A
+      incoherentInverseFFTShiftBuffers(devA, devA),
+      incoherentInverseFFTShiftKernel(
+        factories.fftShift.create(queue, incoherentInverseFFTShiftBuffers)),
      // FIR filter: A -> B
      devIncoherentFilterWeights(
        context,
@@ -335,17 +356,21 @@ namespace LOFAR
    BeamFormerSubbandProc::Counters::Counters(gpu::Context &context)
      :
    intToFloat(context),
+    firstFFTShift(context),
    firstFFT(context),
    delayBp(context),
+    secondFFTShift(context),
    secondFFT(context),
    correctBandpass(context),
    beamformer(context),
    transpose(context),
    inverseFFT(context),
+    inverseFFTShift(context),
    firFilterKernel(context),
    finalFFT(context),
    coherentStokes(context),
    incoherentInverseFFT(context),
+    incoherentInverseFFTShift(context),
    incoherentFirFilterKernel(context),
    incoherentFinalFFT(context),
    incoherentStokes(context),
@@ -362,13 +387,16 @@ namespace LOFAR
      LOG_INFO_STR(
        "**** BeamFormerSubbandProc GPU mean and stDev ****" << endl <<
        std::setw(20) << "(intToFloat)" << intToFloat.stats << endl <<
+        std::setw(20) << "(firstFFTShift)" << firstFFTShift.stats << endl <<
        std::setw(20) << "(firstFFT)" << firstFFT.stats << endl <<
        std::setw(20) << "(delayBp)" << delayBp.stats << endl <<
+        std::setw(20) << "(secondFFTShift)" << secondFFTShift.stats << endl <<
        std::setw(20) << "(secondFFT)" << secondFFT.stats << endl <<
        std::setw(20) << "(correctBandpass)" << correctBandpass.stats << endl <<
        std::setw(20) << "(beamformer)" << beamformer.stats << endl <<
        std::setw(20) << "(transpose)" << transpose.stats << endl <<
        std::setw(20) << "(inverseFFT)" << inverseFFT.stats << endl <<
+        std::setw(20) << "(inverseFFTShift)" << inverseFFTShift.stats << endl <<
        std::setw(20) << "(firFilterKernel)" << firFilterKernel.stats << endl <<
        std::setw(20) << "(finalFFT)" << finalFFT.stats << endl <<
        std::setw(20) << "(coherentStokes)" << coherentStokes.stats << endl <<
@@ -376,6 +404,7 @@ namespace LOFAR
        std::setw(20) << "(visibilities)" << visibilities.stats << endl <<
        std::setw(20) << "(incoherentOutput )" << incoherentOutput.stats << endl <<
        std::setw(20) << "(incoherentInverseFFT)" << incoherentInverseFFT.stats << endl <<
+        std::setw(20) << "(incoherentInverseFFTShift)" << incoherentInverseFFTShift.stats << endl <<
        std::setw(20) << "(incoherentFirFilterKernel)" << incoherentFirFilterKernel.stats << endl <<
        std::setw(20) << "(incoherentFinalFFT)" << incoherentFinalFFT.stats << endl <<
        std::setw(20) << "(incoherentStokes)" << incoherentStokes.stats << endl <<
@@ -419,6 +448,7 @@ namespace LOFAR
      // Otherwise, a kernel arg may not be set...
      intToFloatKernel->enqueue(input.blockID, counters.intToFloat);
+      firstFFTShiftKernel->enqueue(input.blockID, counters.firstFFTShift);
      firstFFT.enqueue(input.blockID, counters.firstFFT);
      delayCompensationKernel->enqueue(
@@ -426,6 +456,7 @@ namespace LOFAR
        ps.settings.subbands[subband].centralFrequency,
        ps.settings.subbands[subband].SAP);
+      secondFFTShiftKernel->enqueue(input.blockID, counters.secondFFTShift);
      secondFFT.enqueue(input.blockID, counters.secondFFT);
      bandPassCorrectionKernel->enqueue(
@@ -440,7 +471,9 @@ namespace LOFAR
          ps.settings.subbands[subband].SAP);
        coherentTransposeKernel->enqueue(input.blockID, counters.transpose);
        inverseFFT.enqueue(input.blockID, counters.inverseFFT);
+        inverseFFTShiftKernel->enqueue(input.blockID, counters.inverseFFTShift);
        if (coherentStokesPPF) 
        {
@@ -464,6 +497,8 @@ namespace LOFAR
        incoherentInverseFFT.enqueue(
          input.blockID, counters.incoherentInverseFFT);
+        incoherentInverseFFTShiftKernel->enqueue(
+          input.blockID, counters.incoherentInverseFFTShift);
        if (incoherentStokesPPF) 
        {

--- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h
+++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h
@@ -37,6 +37,7 @@
 #include <GPUProc/Kernels/CoherentStokesTransposeKernel.h>
 #include <GPUProc/Kernels/CoherentStokesKernel.h>
 #include <GPUProc/Kernels/DelayAndBandPassKernel.h>
+#include <GPUProc/Kernels/FFTShiftKernel.h>
 #include <GPUProc/Kernels/FFT_Kernel.h>
 #include <GPUProc/Kernels/FIR_FilterKernel.h>
 #include <GPUProc/Kernels/IncoherentStokesKernel.h>
@@ -90,18 +91,22 @@ namespace LOFAR
        // gpu kernel counters
        PerformanceCounter intToFloat;
+        PerformanceCounter firstFFTShift;
        PerformanceCounter firstFFT;
        PerformanceCounter delayBp;
+        PerformanceCounter secondFFTShift;
        PerformanceCounter secondFFT;
        PerformanceCounter correctBandpass;
        PerformanceCounter beamformer;
        PerformanceCounter transpose;
        PerformanceCounter inverseFFT;
+        PerformanceCounter inverseFFTShift;
        PerformanceCounter firFilterKernel;
        PerformanceCounter finalFFT;
        PerformanceCounter coherentStokes;
        PerformanceCounter incoherentInverseFFT;
+        PerformanceCounter incoherentInverseFFTShift;
        PerformanceCounter incoherentFirFilterKernel;
        PerformanceCounter incoherentFinalFFT;
        PerformanceCounter incoherentStokes;
@@ -149,6 +154,10 @@ namespace LOFAR
      IntToFloatKernel::Buffers intToFloatBuffers;
      std::auto_ptr<IntToFloatKernel> intToFloatKernel;
+      // First FFT-shift
+      FFTShiftKernel::Buffers firstFFTShiftBuffers;
+      std::auto_ptr<FFTShiftKernel> firstFFTShiftKernel;
      // First (64 points) FFT
      FFT_Kernel firstFFT;
@@ -156,6 +165,10 @@ namespace LOFAR
      DelayAndBandPassKernel::Buffers delayCompensationBuffers;
      std::auto_ptr<DelayAndBandPassKernel> delayCompensationKernel;
+      // Second FFT-shift
+      FFTShiftKernel::Buffers secondFFTShiftBuffers;
+      std::auto_ptr<FFTShiftKernel> secondFFTShiftKernel;
      // Second (64 points) FFT
      FFT_Kernel secondFFT;
@@ -182,6 +195,10 @@ namespace LOFAR
      // inverse (4k points) FFT
      FFT_Kernel inverseFFT;
+      // inverse FFT-shift
+      FFTShiftKernel::Buffers inverseFFTShiftBuffers;
+      std::auto_ptr<FFTShiftKernel> inverseFFTShiftKernel;
      // Poly-phase filter (FIR + FFT)
      gpu::DeviceMemory devFilterWeights;
      gpu::DeviceMemory devFilterHistoryData;
@@ -205,6 +222,10 @@ namespace LOFAR
      // Inverse (4k points) FFT
      FFT_Kernel incoherentInverseFFT;
+      // Inverse FFT-shift
+      FFTShiftKernel::Buffers incoherentInverseFFTShiftBuffers;
+      std::auto_ptr<FFTShiftKernel> incoherentInverseFFTShiftKernel;
      // Poly-phase filter (FIR + FFT)
      gpu::DeviceMemory devIncoherentFilterWeights;
      gpu::DeviceMemory devIncoherentFilterHistoryData;

--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc
@@ -23,6 +23,7 @@
 #include <complex>
 #include <cmath>
+#include <iomanip>
 #include <Common/LofarLogger.h>
 #include <CoInterface/Parset.h>
@@ -39,16 +40,20 @@ using namespace LOFAR::TYPES;
 template<typename T> T inputSignal(size_t t)
 {
  size_t nrBits = sizeof(T) / 2 * 8;
-  double freq = 1.0 / 4.0; // in samples
-  // double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
  double amp = (1 << (nrBits - 1)) - 1;
+#if 1 // Set to 0 to experiment with a pulse-train input instead of the sine wave
+  // Sine wave
+  // double freq = 1.0 / 4.0; // in samples
+  double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
  double angle = (double)t * 2.0 * M_PI * freq;
  double s = ::sin(angle);
  double c = ::cos(angle);
  return T(::round(amp * c), ::round(amp * s));
+#else
+  // Pulse train
+  if (t % (2 * 64 + 17) == 0) return T(amp);
+  else return T(0);
+#endif
 }
 int main() {
@@ -152,7 +157,7 @@ int main() {
  // Block number: 0 .. inf
  in.blockID.block = 0;
- // Subband index in the observation: [0, ps.nrSubbands())
+  // Subband index in the observation: [0, ps.nrSubbands())
  in.blockID.globalSubbandIdx = 0;
  // Subband index for this pipeline/workqueue: [0, subbandIndices.size())
@@ -206,9 +211,10 @@ int main() {
  for (size_t s = 0; s < nrStokes; s++)
    for (size_t t = 0; t < nrSamples; t++)
      for (size_t c = 0; c < nrChannels; c++)
-        ASSERTSTR(fpEquals(out[0][s][t][c], outVal), 
+        ASSERTSTR(fpEquals(out[0][s][t][c], outVal, 1e-4f), 
                  "out[" << s << "][" << t << "][" << c << "] = " << 
-                  out[0][s][t][c] << "; outVal = " << outVal);
+                  setprecision(12) << out[0][s][t][c] << 
+                  "; outVal = " << outVal);
  return 0;
 }

--- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc
+++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCoherentStokesBeamFormerSubbandProcProcessSb.cc
@@ -40,16 +40,20 @@ using namespace LOFAR::TYPES;
 template<typename T> T inputSignal(size_t t)
 {
  size_t nrBits = sizeof(T) / 2 * 8;
-  double freq = 1.0 / 4.0; // in samples
-  // double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
  double amp = (1 << (nrBits - 1)) - 1;
+#if 1  // Set to 0 to experiment with a pulse-train input instead of the sine wave
+  // Sine wave
+  // double freq = 1.0 / 4.0; // in samples
+  double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples
  double angle = (double)t * 2.0 * M_PI * freq;
  double s = ::sin(angle);
  double c = ::cos(angle);
  return T(::round(amp * c), ::round(amp * s));
+#else
+  // Pulse train
+  if (t % (2 * 64 + 17) == 0) return T(amp);
+  else return T(0);
+#endif
 }
 int main() {
@@ -219,7 +223,7 @@ int main() {
    for (size_t t = 0; t < nrSamples; t++)
    for (size_t c = 0; c < nrChannels; c++)
    {
-      ASSERTSTR(fpEquals(out[tab][s][t][c], outVal),
+      ASSERTSTR(fpEquals(out[tab][s][t][c], outVal, 1e-4f),
        "out[" << tab << "][" << s << "][" << t << "][" << c << "] = " << setprecision(12) <<
        out[tab][s][t][c] << "; outVal = " << outVal);
    }