diff --git a/RTCP/Cobalt/GPUProc/doc/pipeline-buffers.txt b/RTCP/Cobalt/GPUProc/doc/pipeline-buffers.txt index 347929e4e1f1424c800351af09d694074594fa1f..07312c924071f2234805705cfcdce9ab57f6b3fa 100644 --- a/RTCP/Cobalt/GPUProc/doc/pipeline-buffers.txt +++ b/RTCP/Cobalt/GPUProc/doc/pipeline-buffers.txt @@ -57,6 +57,8 @@ Correlator V (output) +If 1ch, IntToFloat (A->E) is used instead of FIR+FFT. + (BF) Preprocessing [A -> B, trashes A] ----------------------------------- NB: The key Cobalt.BeamFormer.stationList can be used to select a subset of antenna fields in the observation diff --git a/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu b/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu index 62ef0e89e40a45fbd729e955005375e73a1d8121..3ae289a1258685063d07fcc77493c790ae049ee9 100644 --- a/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu +++ b/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu @@ -65,31 +65,7 @@ typedef fcomplex(*OutputDataType)[NR_STATIONS][NR_CHANNELS][NR_SAMPLES_PER_CHAN typedef fcomplex(*OutputDataType)[NR_STATIONS][NR_POLARIZATIONS][NR_CHANNELS][NR_SAMPLES_PER_CHANNEL]; #endif -//# TODO: Unify #dims in input type to 4: [NR_SAMPLES_PER_SUBBAND] -> [NR_SAMPLES_PER_CHANNEL][NR_CHANNELS] (see kernel test) -//# It is technically incorrect, but different dims for the same input type is a real pain to use/supply. -//# Also unify order of #chn, #sampl to [NR_SAMPLES_PER_CHANNEL][NR_CHANNELS] -#ifdef INPUT_IS_STATIONDATA -# if NR_BITS_PER_SAMPLE == 16 -typedef short_complex rawSampleType; -typedef short_complex(*InputDataType)[NR_STATIONS][NR_SAMPLES_PER_SUBBAND][NR_POLARIZATIONS]; -#define REAL(sample) sample.x -#define IMAG(sample) sample.y -# elif NR_BITS_PER_SAMPLE == 8 -typedef char_complex rawSampleType; -typedef char_complex(*InputDataType)[NR_STATIONS][NR_SAMPLES_PER_SUBBAND][NR_POLARIZATIONS]; -#define REAL(sample) sample.x -#define IMAG(sample) sample.y -# elif NR_BITS_PER_SAMPLE == 4 -typedef signed char rawSampleType; -typedef signed char (*InputDataType)[NR_STATIONS][NR_SAMPLES_PER_SUBBAND][NR_POLARIZATIONS]; -#define REAL(sample) extractRI(sample, false) -#define IMAG(sample) extractRI(sample, true) -# else -# error unsupported NR_BITS_PER_SAMPLE -# endif -#else typedef fcomplex(*InputDataType)[NR_STATIONS][NR_POLARIZATIONS][NR_SAMPLES_PER_CHANNEL][NR_CHANNELS]; -#endif typedef const double(*DelaysType)[NR_SAPS][NR_DELAYS][NR_POLARIZATIONS]; // 2 Polarizations; in seconds typedef const double2(*Phase0sType)[NR_STATIONS]; // 2 Polarizations; in radians typedef const float(*BandPassFactorsType)[NR_CHANNELS]; @@ -124,9 +100,7 @@ inline __device__ fcomplex sincos_d2f(double phi) * of ::complex (2 complex polarizations) * @param[in] filteredDataPtr pointer to input data; this can either be a * 4D array [station][polarization][sample][channel][complex] -* of ::fcomplex, or a 2D array [station][subband][complex] -* of ::short_complex2 or ::char_complex2, -* depending on the value of @c NR_CHANNELS +* of ::fcomplex * @param[in] subbandFrequency center freqency of the subband * @param[in] beam index number of the beam * @param[in] delaysAtBeginPtr pointer to delay data of ::DelaysType, @@ -251,17 +225,8 @@ extern "C" { for (unsigned time = timeStart; time < NR_SAMPLES_PER_CHANNEL; time += timeInc) { -#ifdef INPUT_IS_STATIONDATA - const rawSampleType sampleXraw = (*inputData)[station][time][0]; - fcomplex sampleX = make_float2(convertIntToFloat(REAL(sampleXraw)), - convertIntToFloat(IMAG(sampleXraw))); - const rawSampleType sampleYraw = (*inputData)[station][time][1]; - fcomplex sampleY = make_float2(convertIntToFloat(REAL(sampleYraw)), - convertIntToFloat(IMAG(sampleYraw))); -#else fcomplex sampleX = (*inputData)[station][0][time][channel]; fcomplex sampleY = (*inputData)[station][1][time][channel]; -#endif #if defined DELAY_COMPENSATION // Offset of this sample between begin and end. diff --git a/RTCP/Cobalt/GPUProc/share/gpu/kernels/IntToFloat.cu b/RTCP/Cobalt/GPUProc/share/gpu/kernels/IntToFloat.cu index b8e002d8ee5299e1a90a58e72e1fc22b82be0cb8..f0a8fb035c35727cad3470df3cdd702eb4dd2af4 100644 --- a/RTCP/Cobalt/GPUProc/share/gpu/kernels/IntToFloat.cu +++ b/RTCP/Cobalt/GPUProc/share/gpu/kernels/IntToFloat.cu @@ -67,6 +67,8 @@ typedef float2 (*ConvertedDataType)[NR_OUTPUT_STATIONS][NR_POLARIZATIONS][NR * * Optional preprocessor symbols: * - DO_FFTSHIFT, if an fft-shift is to be performed as well + * - DO_STATIONSUBSET, if the stationIndices input array is to be used to select a subset + * of stations. * * Execution configuration: * - Use a 1D thread block. No restrictions. @@ -82,8 +84,13 @@ __global__ void intToFloat(void *convertedDataPtr, ConvertedDataType convertedData = (ConvertedDataType)convertedDataPtr; SampledDataType sampledData = (SampledDataType) sampledDataPtr; +#ifdef DO_STATIONSUBSET uint station_in = stationIndices[blockIdx.y]; uint station_out = blockIdx.y; +#else + uint station_in = blockIdx.y; + uint station_out = blockIdx.y; +#endif #ifdef DO_FFTSHIFT // Multiplication factor: 1 for even samples, -1 for odd samples diff --git a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h b/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h index 5efc4d02212d4a0a6e7d5eae8ec049a431c19912..9fc75474dd12c2915322dc8f775ed1e15a27a0f6 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/KernelFactory.h @@ -99,8 +99,12 @@ namespace LOFAR { // Since we use overlapping input/output buffers, their size // could be larger than we need. - ASSERT(buffers.input.size() >= bufferSize(T::INPUT_DATA)); - ASSERT(buffers.output.size() >= bufferSize(T::OUTPUT_DATA)); + ASSERTSTR(buffers.input.size() >= bufferSize(T::INPUT_DATA), + "Require " << bufferSize(T::INPUT_DATA) << " bytes for input, " + "but buffer is only " << buffers.input.size() << " bytes."); + ASSERTSTR(buffers.output.size() >= bufferSize(T::OUTPUT_DATA), + "Require " << bufferSize(T::OUTPUT_DATA) << " bytes for output, " + "but buffer is only " << buffers.output.size() << " bytes."); return new T( stream, createModule(stream.getContext(), diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc index 05cbe311d80d9568bb0046f8bee28c3eed442eb8..6205b45ad6c2e935adaf8f71e52e622bacac7103 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc @@ -50,10 +50,6 @@ namespace LOFAR delayIndices(nrStations), nrDelays(ps.settings.antennaFieldNames.size()), nrBitsPerSample(ps.settings.nrBitsPerSample), - inputIsStationData(correlator && ps.settings.correlator.nrChannels == 1 - ? true - : false), - nrChannels(correlator ? ps.settings.correlator.nrChannels : ps.settings.beamFormer.nrDelayCompensationChannels), nrSamplesPerChannel(ps.settings.blockSize / nrChannels), @@ -94,9 +90,7 @@ namespace LOFAR unsigned DelayAndBandPassKernel::Parameters::nrBytesPerComplexSample() const { - return inputIsStationData - ? 2 * nrBitsPerSample / 8 - : sizeof(std::complex<float>); + return sizeof(std::complex<float>); } @@ -213,9 +207,6 @@ namespace LOFAR defs["NR_BITS_PER_SAMPLE"] = lexical_cast<string>(itsParameters.nrBitsPerSample); - if (itsParameters.inputIsStationData) - defs["INPUT_IS_STATIONDATA"] = "1"; - defs["NR_CHANNELS"] = lexical_cast<string>(itsParameters.nrChannels); defs["NR_SAMPLES_PER_CHANNEL"] = lexical_cast<string>(itsParameters.nrSamplesPerChannel); diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.h index d5bcd7bb673b5fa8eb789edab75c6abf9b9a1957..a2f051c6f0aef458ac27e3d96d3b58e9d50fb314 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.h @@ -60,7 +60,6 @@ namespace LOFAR std::vector<unsigned> delayIndices; unsigned nrDelays; unsigned nrBitsPerSample; - bool inputIsStationData; unsigned nrChannels; unsigned nrSamplesPerChannel; diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.cc index e29f6af6e5cf7bd86832eb27c1d59fa40bf72b80..57717698695df1aa0c066546f7b9f642ee10204b 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.cc @@ -45,17 +45,16 @@ namespace LOFAR string IntToFloatKernel::theirSourceFile = "IntToFloat.cu"; string IntToFloatKernel::theirFunction = "intToFloat"; - IntToFloatKernel::Parameters::Parameters(const Parset& ps) : + IntToFloatKernel::Parameters::Parameters(const Parset& ps, bool fftShift, bool beamFormerStationSubset) : Kernel::Parameters("intToFloat"), nrInputStations(ps.settings.antennaFields.size()), - stationIndices(ObservationSettings::AntennaFieldName::indices(ps.settings.beamFormer.antennaFieldNames, ps.settings.antennaFieldNames)), + stationIndices(beamFormerStationSubset ? ObservationSettings::AntennaFieldName::indices(ps.settings.beamFormer.antennaFieldNames, ps.settings.antennaFieldNames) : vector<unsigned>()), nrBitsPerSample(ps.settings.nrBitsPerSample), nrSamplesPerSubband(ps.settings.blockSize), - fftShift(ps.settings.beamFormer.nrDelayCompensationChannels > 1) + fftShift(fftShift), + doStationSubset(beamFormerStationSubset) { - ASSERTSTR(ps.settings.beamFormer.enabled, "IntToFloatKernel::Parameters assumes it will be used in the beamFormer, but the beamFormer is not enabled."); - dumpBuffers = ps.getBool("Cobalt.Kernels.IntToFloatKernel.dumpOutput", false); dumpFilePattern = @@ -133,6 +132,8 @@ namespace LOFAR if (itsParameters.fftShift) defs["DO_FFTSHIFT"] = "1"; + if (itsParameters.doStationSubset) + defs["DO_STATIONSUBSET"] = "1"; return defs; } diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.h index 3c212b974f6475565936841e9d0f127afef0aad7..434d6f31b097ee17b94aa685120b47f61e76e5a6 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/IntToFloatKernel.h @@ -51,7 +51,7 @@ namespace LOFAR // IntToFloatKernel class. struct Parameters : Kernel::Parameters { - Parameters(const Parset& ps); + Parameters(const Parset& ps, bool fftShift, bool beamFormerStationSubset); unsigned nrInputStations; std::vector<unsigned> stationIndices; // input station nr for ewch output station @@ -61,10 +61,11 @@ namespace LOFAR unsigned nrSamplesPerSubband; bool fftShift; + bool doStationSubset; size_t bufferSize(BufferType bufferType) const; - unsigned nrOutputStations() const { return stationIndices.size(); } + unsigned nrOutputStations() const { return doStationSubset ? stationIndices.size() : nrInputStations; } }; IntToFloatKernel(const gpu::Stream &stream, diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/ZeroingKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/ZeroingKernel.cc index 4d0d9a1e8cab696c808f26a988f3d628f840c9ce..cb828e4eb085a3aa533e2715ed22362fff90adf2 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/ZeroingKernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/ZeroingKernel.cc @@ -108,6 +108,7 @@ namespace LOFAR // marshall flags to GPU host buffer computeMaskTimer.start(); for(unsigned station = 0; station < nrSTABs; ++station) { + LOG_DEBUG_STR("Flags for block " << blockId << ", station " << station << ": " << channelFlags[station]); channelFlags[station].toByteset(hostMask.get<char>() + station * nrSamplesPerChannel, nrSamplesPerChannel); } computeMaskTimer.stop(); diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerPreprocessingStep.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerPreprocessingStep.cc index e85db40772492a8a63dfd4eabc20818ef7389670..a9aac05e5e1c927a5f538b3b92381ff5498701fc 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerPreprocessingStep.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerPreprocessingStep.cc @@ -37,7 +37,10 @@ namespace LOFAR namespace Cobalt { BeamFormerPreprocessingStep::Factories::Factories(const Parset &ps) : - intToFloat(ps), + intToFloat(IntToFloatKernel::Parameters( + ps, + ps.settings.beamFormer.nrDelayCompensationChannels > 1, + true)), firstFFT(FFT_Kernel::Parameters( ps.settings.beamFormer.nrDelayCompensationChannels, diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.cc index b58796880974d8c4665ccf843e7c61bb5ba37269..e4d68345a01252163e97a7c743e6f0f88552087c 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.cc @@ -61,12 +61,15 @@ namespace LOFAR std::sqrt((double)ps.settings.correlator.nrChannels), "FIR (correlator)")) : NULL), - zeroing(ps.settings.correlator.nrChannels > 1 - ? new KernelFactory<ZeroingKernel>(ZeroingKernel::Parameters(ps, + + intToFloat(ps.settings.correlator.nrChannels == 1 + ? new KernelFactory<IntToFloatKernel>(IntToFloatKernel::Parameters(ps, false, false)) + : NULL), + + zeroing(ZeroingKernel::Parameters(ps, ps.settings.antennaFields.size(), ps.settings.correlator.nrChannels, - "Zeroing (correlator)")) - : NULL), + "Zeroing (correlator)")), delayAndBandPass(DelayAndBandPassKernel::Parameters(ps, true)), @@ -257,10 +260,8 @@ namespace LOFAR : ProcessStep(parset, i_queue), correlatorPPF(ps.settings.correlator.nrChannels > 1), - devE(context, correlatorPPF - ? std::max(factories.correlator.bufferSize(CorrelatorKernel::INPUT_DATA), - factories.correlator.bufferSize(CorrelatorKernel::OUTPUT_DATA)) - : factories.correlator.bufferSize(CorrelatorKernel::OUTPUT_DATA)), + devE(context, std::max(factories.correlator.bufferSize(CorrelatorKernel::INPUT_DATA), + factories.correlator.bufferSize(CorrelatorKernel::OUTPUT_DATA))), outputCounter(context, "output (correlator)"), integratedData(nrSubbandsPerSubbandProc) { @@ -273,14 +274,16 @@ namespace LOFAR // FFT: B -> E fftKernel = factories.fft->create(queue, *devB, devE); - - // Zeroing: E -> E - zeroingKernel = factories.zeroing->create(queue, devE, devE); + } else { + // intToFloat: A -> E + intToFloatKernel = factories.intToFloat->create(queue, *devA, devE); } - // Delay and Bandpass: A/E -> B - delayAndBandPassKernel = std::auto_ptr<DelayAndBandPassKernel>(factories.delayAndBandPass.create(queue, - correlatorPPF ? devE : *devA, *devB)); + // Zeroing: E -> E + zeroingKernel = factories.zeroing.create(queue, devE, devE); + + // Delay and Bandpass: E -> B + delayAndBandPassKernel = std::auto_ptr<DelayAndBandPassKernel>(factories.delayAndBandPass.create(queue, devE, *devB)); // Correlator: B -> E correlatorKernel = std::auto_ptr<CorrelatorKernel>(factories.correlator.create(queue, @@ -333,6 +336,11 @@ namespace LOFAR // Zero the output of each FFT that had flagged input samples zeroingKernel->enqueue(input.blockID, flagsPerChannel); + } else { + intToFloatKernel->enqueue(input.blockID); + + // Zero the flagged samples + zeroingKernel->enqueue(input.blockID, input.inputFlags); } // Even if we skip delay compensation and bandpass correction (rare), run diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.h index 048fb6f5fb5a114a812c8658b865ba76f4e85278..052c850cb21e3aa47c3d54880464b40db00224d7 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/CorrelatorStep.h @@ -45,6 +45,7 @@ #include <GPUProc/Kernels/DelayAndBandPassKernel.h> #include <GPUProc/Kernels/FIR_FilterKernel.h> #include <GPUProc/Kernels/FFT_Kernel.h> +#include <GPUProc/Kernels/IntToFloatKernel.h> #include <GPUProc/Kernels/ZeroingKernel.h> #include <GPUProc/Kernels/CorrelatorKernel.h> @@ -61,7 +62,9 @@ namespace LOFAR SmartPtr< KernelFactory<FFT_Kernel> > fft; SmartPtr< KernelFactory<FIR_FilterKernel> > firFilter; - SmartPtr< KernelFactory<ZeroingKernel> > zeroing; + SmartPtr< KernelFactory<IntToFloatKernel> > intToFloat; + + KernelFactory<ZeroingKernel> zeroing; KernelFactory<DelayAndBandPassKernel> delayAndBandPass; @@ -148,6 +151,9 @@ namespace LOFAR // FFT SmartPtr<FFT_Kernel> fftKernel; + // IntToFloat (in case of no FFT) + SmartPtr<IntToFloatKernel> intToFloatKernel; + // Zeroing SmartPtr<ZeroingKernel> zeroingKernel; diff --git a/RTCP/Cobalt/GPUProc/test/Kernels/tIntToFloatKernel.cc b/RTCP/Cobalt/GPUProc/test/Kernels/tIntToFloatKernel.cc index 1736f9c0aee34b7547d224421ed7239b46a285eb..b7f6e4a8fa45f127f191c7055428d617a2fb4e5c 100644 --- a/RTCP/Cobalt/GPUProc/test/Kernels/tIntToFloatKernel.cc +++ b/RTCP/Cobalt/GPUProc/test/Kernels/tIntToFloatKernel.cc @@ -49,7 +49,7 @@ int main() { gpu::Stream stream(ctx); Parset ps("tIntToFloatKernel.in_parset"); - KernelFactory<IntToFloatKernel> factory(ps); + KernelFactory<IntToFloatKernel> factory(IntToFloatKernel::Parameters(ps, true, true)); size_t nSampledData = factory.bufferSize(IntToFloatKernel::INPUT_DATA) / sizeof(char); size_t sizeSampledData = nSampledData * sizeof(char); diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/CMakeLists.txt b/RTCP/Cobalt/GPUProc/test/SubbandProcs/CMakeLists.txt index c906aeb43a7312c84ad702ba7c63d033cc73a09c..4a693506fbf7af9f111e17e629077a78c26d2de3 100644 --- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/CMakeLists.txt +++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/CMakeLists.txt @@ -5,14 +5,13 @@ include(LofarCTest) if(UNITTEST++_FOUND) lofar_add_test(tCorrelatorSubbandProc tCorrelatorSubbandProc.cc) lofar_add_test(tCorrelatorStep tCorrelatorStep.cc) + lofar_add_test(tCorrelatorSubbandProcProcessSb tCorrelatorSubbandProcProcessSb.cc) endif() # This test is instable. Added to issue tracker: https://support.astron.nl/lofar_issuetracker/issues/5807 # Understand the proble lofar_add_test(tBeamFormerSubbandProcProcessSb tBeamFormerSubbandProcProcessSb.cc) -lofar_add_test(tCorrelatorSubbandProcProcessSb - tCorrelatorSubbandProcProcessSb.cc) lofar_add_test(tCoherentStokesBeamFormerSubbandProcProcessSb tCoherentStokesBeamFormerSubbandProcProcessSb.cc ../Kernels/KernelTestHelpers.cc) lofar_add_test(tFlysEyeBeamFormerSubbandProcProcessSb diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorSubbandProcProcessSb.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorSubbandProcProcessSb.cc index bd14f401529520869f2da3a006c2cef9266c9ed9..21a4cc8eeeb5baa1e34d9fe243ffd30c04cbf5ac 100644 --- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorSubbandProcProcessSb.cc +++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tCorrelatorSubbandProcProcessSb.cc @@ -29,148 +29,240 @@ #include <GPUProc/SubbandProcs/KernelFactories.h> #include <GPUProc/SubbandProcs/SubbandProc.h> +#include <UnitTest++.h> + using namespace std; using namespace LOFAR; using namespace LOFAR::Cobalt; +struct SubbandProcWrapper { + // Create very simple kernel programs, with predictable output. Skip as much + // as possible. Nr of channels/sb from the parset is 1, so the PPF will not + // even run. Parset also has turned of delay compensation and bandpass + // correction (but that kernel will run to convert int to float and to + // transform the data order). + + gpu::Device device; + gpu::Context ctx; + + Parset ps; + KernelFactories factories; + SubbandProc cwq; + SubbandProcInputData in; + SubbandProcOutputData out; + + fcomplex inputValue; + + SubbandProcWrapper(Parset &ps): + device(0), + ctx(device), + ps(ps), + factories(ps, 1), + cwq(ps, ctx, factories), + in(ps.settings.SAPs.size(), + ps.settings.antennaFields.size(), + ps.settings.nrPolarisations, + ps.settings.beamFormer.maxNrTABsPerSAP(), + ps.settings.blockSize, + ps.nrBytesPerComplexSample(), + ctx), + out(ps, ctx), + + inputValue(1,1) + { + // Input info + const size_t nrBeams = ps.settings.SAPs.size(); + const size_t nrStations = ps.settings.antennaFields.size(); + const size_t nrPolarisations = ps.settings.nrPolarisations; + const size_t maxNrTABsPerSAP = ps.settings.beamFormer.maxNrTABsPerSAP(); + const size_t nrSamplesPerIntegration = ps.settings.correlator.nrSamplesPerIntegration(); + const size_t nrSamplesPerSubband = ps.settings.blockSize; + const size_t nrBitsPerSample = ps.settings.nrBitsPerSample; + const size_t nrBytesPerComplexSample = ps.nrBytesPerComplexSample(); + + // We only support 8-bit or 16-bit input samples + ASSERT(nrBitsPerSample == 8 || nrBitsPerSample == 16); + + // Output info + const size_t nrBaselines = ps.nrBaselines(); + const size_t nrBlocksPerIntegration = + ps.settings.correlator.nrBlocksPerIntegration; + const size_t nrChannelsPerSubband = ps.settings.correlator.nrChannels; + const size_t integrationSteps = ps.settings.correlator.nrSamplesPerIntegration(); + + LOG_INFO_STR( + "\nInput info:" << + "\n nrBeams = " << nrBeams << + "\n nrStations = " << nrStations << + "\n nrPolarisations = " << nrPolarisations << + "\n maxNrTABsPerSAP = " << maxNrTABsPerSAP << + "\n nrSamplesPerIntegration = " << nrSamplesPerIntegration << + "\n nrSamplesPerSubband = " << nrSamplesPerSubband << + "\n nrBitsPerSample = " << nrBitsPerSample << + "\n nrBytesPerComplexSample = " << nrBytesPerComplexSample << + "\n ----------------------------" << + "\n Total bytes = " << in.inputSamples.size()); + + LOG_INFO_STR( + "\nOutput info:" << + "\n nrBaselines = " << nrBaselines << + "\n nrBlockPerIntegration = " << nrBlocksPerIntegration << + "\n nrChannelsPerSubband = " << nrChannelsPerSubband << + "\n integrationSteps = " << integrationSteps << + "\n ----------------------------" << + "\n Total bytes = " << out.correlatedData.subblocks[0]->visibilities.size()); + + // Initialize synthetic input to all (1, 1). + for (size_t st = 0; st < nrStations; st++) + for (size_t i = 0; i < nrSamplesPerSubband; i++) + for (size_t pol = 0; pol < nrPolarisations; pol++) + setInputValue(st, i, pol, inputValue); + + // Initialize subbands partitioning administration (struct BlockID). We only + // do the 1st block of whatever. + in.blockID.block = 0; + in.blockID.globalSubbandIdx = 0; + in.blockID.localSubbandIdx = 0; + in.blockID.subbandProcSubbandIdx = 0; + + // Initialize delays. We skip delay compensation, but init anyway, + // so we won't copy uninitialized data to the device. + for (size_t i = 0; i < in.delaysAtBegin.size(); i++) + in.delaysAtBegin.get<float>()[i] = 0.0f; + for (size_t i = 0; i < in.delaysAfterEnd.size(); i++) + in.delaysAfterEnd.get<float>()[i] = 0.0f; + for (size_t i = 0; i < in.phase0s.size(); i++) + in.phase0s.get<float>()[i] = 0.0f; + } + + void setInputValue(unsigned station, unsigned t, unsigned pol, fcomplex value) { + const size_t nrBitsPerSample = ps.settings.nrBitsPerSample; + + switch(nrBitsPerSample) { + case 8: + reinterpret_cast<i8complex&>(in.inputSamples[station][t][pol][0]) = + i8complex(value); + break; + case 16: + reinterpret_cast<i16complex&>(in.inputSamples[station][t][pol][0]) = + i16complex(value); + break; + } + } + + void process() { + const ssize_t nrBlocksPerIntegration = + ps.settings.correlator.nrBlocksPerIntegration; + + ssize_t block(0); + + LOG_INFO("Processing ..."); + for (block = -1; block < nrBlocksPerIntegration; block++) { + LOG_DEBUG_STR("Processing block #" << block); + + in.blockID.block = block; + cwq.processSubband(in, out); + if (block >= 0) + cwq.postprocessSubband(out); + } + ASSERT(block == nrBlocksPerIntegration); + ASSERT(out.emit_correlatedData); + } + + fcomplex outputValue() const { + // The output is the correlation-product of two inputs (with identical + // `inputValue`). + + const size_t scaleFactor = ps.settings.nrBitsPerSample == 16 ? 1 : 16; + + return norm(inputValue) * scaleFactor * scaleFactor; + } + + void verifyOutput() const { + // we don't process the FFT in our reference calculations yet + ASSERT(ps.settings.correlator.nrChannels == 1); + + LOG_INFO("Verifying output ..."); + for (size_t b = 0; b < ps.nrBaselines(); b++) + for (size_t c = 0; c < ps.settings.correlator.nrChannels; c++) + for (size_t pol0 = 0; pol0 < ps.settings.nrPolarisations; pol0++) + for (size_t pol1 = 0; pol1 < ps.settings.nrPolarisations; pol1++) + ASSERTSTR(fpEquals(out.correlatedData.subblocks[0]->visibilities[b][c][pol0][pol1], outputValue()), + "out[" << b << "][" << c << "][" << pol0 << + "][" << pol1 << "] = " << out.correlatedData.subblocks[0]->visibilities[b][c][pol0][pol1] << + "; outputValue = " << outputValue()); + } +}; + +// Test the output on clean data +TEST(output_noflags) { + Parset ps("tCorrelatorSubbandProcProcessSb.parset"); + SubbandProcWrapper wrapper(ps); + + wrapper.process(); + wrapper.verifyOutput(); +} + +// Test the output if there is a flagged sample +TEST(output_flags) { + Parset ps("tCorrelatorSubbandProcProcessSb.parset"); + SubbandProcWrapper wrapper(ps); + + // Flag one sample + wrapper.in.inputFlags[0].include(13); + // Replace it with an extreme value to know whether it's actually skipped + wrapper.setInputValue(0, 13, 0, fcomplex(100,100)); + wrapper.setInputValue(0, 13, 1, fcomplex(100,100)); + + // process + wrapper.process(); + wrapper.verifyOutput(); +} + +// Test the final weights after FFT if there is flagged data +TEST(weights_flags_64ch) { + // Override nr channels to 64 + Parset ps("tCorrelatorSubbandProcProcessSb.parset"); + ps.replace("Cobalt.Correlator.nrChannelsPerSubband", "64"); + ps.updateSettings(); + SubbandProcWrapper wrapper(ps); + + // Flag one sample + wrapper.in.inputFlags[0].include(13); + + // process + wrapper.process(); + + // We reuse the flagged sample in all of the blocks, so + // we actually lose "nrBlocks" samples. + const unsigned nrBlocks = ps.settings.correlator.nrBlocksPerIntegration; + const unsigned nrValidSamples = wrapper.ps.settings.correlator.nrSamplesPerIntegration(); + const unsigned nrValidSamplesFlagged = + nrValidSamples + // we lose "NR_TAPS" samples per block for each flagged sample + - nrBlocks * NR_TAPS; + + LOG_INFO("Verifying output weights ..."); + for (size_t b = 0; b < wrapper.ps.nrBaselines(); b++) + for (size_t c = 1; c < wrapper.ps.settings.correlator.nrChannels; c++) { + // baseline 0 and 1 contain station 0 with a flagged sample. + unsigned expected = b < 2 ? nrValidSamplesFlagged : nrValidSamples; + ASSERTSTR(wrapper.out.correlatedData.subblocks[0]->getNrValidSamples(b, c) == expected, + "nrValidSamples[" << b << "][" << c << "] = " + << wrapper.out.correlatedData.subblocks[0]->getNrValidSamples(b, c) + << "; expected " << expected); + } +} + int main() { INIT_LOGGER("tCorrelatorSubbandProcProcessSb"); try { gpu::Platform pf; - cout << "Detected " << pf.size() << " CUDA devices" << endl; + return UnitTest::RunAllTests() > 0; } catch (gpu::CUDAException& e) { cerr << e.what() << endl; return 3; } - - gpu::Device device(0); - vector<gpu::Device> devices(1, device); - gpu::Context ctx(device); - - Parset ps("tCorrelatorSubbandProcProcessSb.parset"); - - // Input info - const size_t nrBeams = ps.settings.SAPs.size(); - const size_t nrStations = ps.settings.antennaFields.size(); - const size_t nrPolarisations = ps.settings.nrPolarisations; - const size_t maxNrTABsPerSAP = ps.settings.beamFormer.maxNrTABsPerSAP(); - const size_t nrSamplesPerChannel = ps.settings.correlator.nrSamplesPerIntegration(); - const size_t nrSamplesPerSubband = ps.settings.blockSize; - const size_t nrBitsPerSample = ps.settings.nrBitsPerSample; - const size_t nrBytesPerComplexSample = ps.nrBytesPerComplexSample(); - const fcomplex inputValue(1,1); - - // We only support 8-bit or 16-bit input samples - ASSERT(nrBitsPerSample == 8 || nrBitsPerSample == 16); - - // Output info - const size_t nrBaselines = nrStations * (nrStations + 1) / 2; - const size_t nrBlocksPerIntegration = - ps.settings.correlator.nrBlocksPerIntegration; - const size_t nrChannelsPerSubband = ps.settings.correlator.nrChannels; - const size_t integrationSteps = ps.settings.correlator.nrSamplesPerIntegration(); - const size_t scaleFactor = nrBitsPerSample == 16 ? 1 : 16; - - // The output is the correlation-product of two inputs (with identical - // `inputValue`). - const fcomplex outputValue = - norm(inputValue) * scaleFactor * scaleFactor; - - // Create very simple kernel programs, with predictable output. Skip as much - // as possible. Nr of channels/sb from the parset is 1, so the PPF will not - // even run. Parset also has turned of delay compensation and bandpass - // correction (but that kernel will run to convert int to float and to - // transform the data order). - - KernelFactories factories(ps, 1); - SubbandProc cwq(ps, ctx, factories); - - SubbandProcInputData in( - nrBeams, nrStations, nrPolarisations, maxNrTABsPerSAP, - nrSamplesPerSubband, nrBytesPerComplexSample, ctx); - - SubbandProcOutputData out(ps, ctx); - - LOG_INFO_STR( - "\nInput info:" << - "\n nrBeams = " << nrBeams << - "\n nrStations = " << nrStations << - "\n nrPolarisations = " << nrPolarisations << - "\n maxNrTABsPerSAP = " << maxNrTABsPerSAP << - "\n nrSamplesPerChannel = " << nrSamplesPerChannel << - "\n nrSamplesPerSubband = " << nrSamplesPerSubband << - "\n nrBitsPerSample = " << nrBitsPerSample << - "\n nrBytesPerComplexSample = " << nrBytesPerComplexSample << - "\n inputValue = " << inputValue << - "\n ----------------------------" << - "\n Total bytes = " << in.inputSamples.size()); - - LOG_INFO_STR( - "\nOutput info:" << - "\n nrBaselines = " << nrBaselines << - "\n nrBlockPerIntegration = " << nrBlocksPerIntegration << - "\n nrChannelsPerSubband = " << nrChannelsPerSubband << - "\n integrationSteps = " << integrationSteps << - "\n scaleFactor = " << scaleFactor << - "\n outputValue = " << outputValue << - "\n ----------------------------" << - "\n Total bytes = " << out.correlatedData.subblocks[0]->visibilities.size()); - - // Initialize synthetic input to all (1, 1). - for (size_t st = 0; st < nrStations; st++) - for (size_t i = 0; i < nrSamplesPerSubband; i++) - for (size_t pol = 0; pol < nrPolarisations; pol++) - { - switch(nrBitsPerSample) { - case 8: - reinterpret_cast<i8complex&>(in.inputSamples[st][i][pol][0]) = - i8complex(inputValue); - break; - case 16: - reinterpret_cast<i16complex&>(in.inputSamples[st][i][pol][0]) = - i16complex(inputValue); - break; - } - } - - // Initialize subbands partitioning administration (struct BlockID). We only - // do the 1st block of whatever. - in.blockID.block = 0; - in.blockID.globalSubbandIdx = 0; - in.blockID.localSubbandIdx = 0; - in.blockID.subbandProcSubbandIdx = 0; - - // Initialize delays. We skip delay compensation, but init anyway, - // so we won't copy uninitialized data to the device. - for (size_t i = 0; i < in.delaysAtBegin.size(); i++) - in.delaysAtBegin.get<float>()[i] = 0.0f; - for (size_t i = 0; i < in.delaysAfterEnd.size(); i++) - in.delaysAfterEnd.get<float>()[i] = 0.0f; - for (size_t i = 0; i < in.phase0s.size(); i++) - in.phase0s.get<float>()[i] = 0.0f; - - size_t block(0); - - LOG_INFO("Processing ..."); - for (block = 0; block < nrBlocksPerIntegration && !out.emit_correlatedData; block++) { - LOG_DEBUG_STR("Processing block #" << block); - cwq.processSubband(in, out); - cwq.postprocessSubband(out); - } - ASSERT(block == nrBlocksPerIntegration); - - LOG_INFO("Verifying output ..."); - for (size_t b = 0; b < nrBaselines; b++) - for (size_t c = 0; c < nrChannelsPerSubband; c++) - for (size_t pol0 = 0; pol0 < nrPolarisations; pol0++) - for (size_t pol1 = 0; pol1 < nrPolarisations; pol1++) - ASSERTSTR(fpEquals(out.correlatedData.subblocks[0]->visibilities[b][c][pol0][pol1], outputValue), - "out[" << b << "][" << c << "][" << pol0 << - "][" << pol1 << "] = " << out.correlatedData.subblocks[0]->visibilities[b][c][pol0][pol1] << - "; outputValue = " << outputValue); - - LOG_INFO("Test OK"); - return 0; } diff --git a/RTCP/Cobalt/GPUProc/test/cuda/tDelayAndBandPass.cc b/RTCP/Cobalt/GPUProc/test/cuda/tDelayAndBandPass.cc index 2272df30b5d09e962de11fa975ca7ff0bd2a3a2e..f78dab5af26fdbd43884b80b0dadefefbaa0b571 100644 --- a/RTCP/Cobalt/GPUProc/test/cuda/tDelayAndBandPass.cc +++ b/RTCP/Cobalt/GPUProc/test/cuda/tDelayAndBandPass.cc @@ -59,7 +59,6 @@ const unsigned NR_POLARIZATIONS = 2; const unsigned NR_SAPS = 8; const double SUBBAND_BANDWIDTH = 0.0 * NR_CHANNELS; -const bool INPUT_IS_STATIONDATA = false; const bool BANDPASS_CORRECTION = true; const bool DELAY_COMPENSATION = false; const bool DO_TRANSPOSE = true; @@ -150,8 +149,6 @@ CompileDefinitions getDefaultCompileDefinitions() boost::lexical_cast<string>(NR_STATIONS); defs["NR_DELAYS"] = boost::lexical_cast<string>(NR_DELAYS); - if (INPUT_IS_STATIONDATA) - defs["INPUT_IS_STATIONDATA"] = "1"; defs["NR_CHANNELS"] = boost::lexical_cast<string>(NR_CHANNELS); @@ -178,9 +175,7 @@ CompileDefinitions getDefaultCompileDefinitions() return defs; } -// T is an LCS i*complex type, or complex<float> when #chnl > 1. // It is the value type of the data input array. -template <typename T> vector<fcomplex> runTest(const CompileDefinitions& compileDefs, double subbandFrequency, unsigned beam, @@ -192,15 +187,7 @@ vector<fcomplex> runTest(const CompileDefinitions& compileDefs, gpu::Context ctx(stream->getContext()); boost::scoped_ptr<MultiDimArrayHostBuffer<fcomplex, 4> > outputData; - boost::scoped_ptr<MultiDimArrayHostBuffer<T, 4> > inputData; - - if (compileDefs.find("INPUT_IS_STATIONDATA") == compileDefs.end()) { - // If input does not come from station, we'll read fcomplex. - ASSERT(sizeof(T) == sizeof(fcomplex)); - } else { - // If input does NOT come from station, NR_BITS_PER_SAMPLE needs to match T. - ASSERT(boost::lexical_cast<unsigned>(compileDefs.at("NR_BITS_PER_SAMPLE")) == 8 * sizeof(T) / 2); - } + boost::scoped_ptr<MultiDimArrayHostBuffer<fcomplex, 4> > inputData; if(compileDefs.find("DO_TRANSPOSE") != compileDefs.end()) outputData.reset( @@ -225,7 +212,7 @@ vector<fcomplex> runTest(const CompileDefinitions& compileDefs, unsigned nchnl = boost::lexical_cast<unsigned>(cit->second); if (nchnl == 1) // integer input data (FIR+FFT skipped) inputData.reset( - new MultiDimArrayHostBuffer<T, 4>(boost::extents + new MultiDimArrayHostBuffer<fcomplex, 4>(boost::extents [NR_STATIONS] [NR_SAMPLES_PER_CHANNEL] [NR_CHANNELS] @@ -317,7 +304,7 @@ TEST(BandPass) // The input samples are all ones. After correction, multiply with 2. // The first and the last complex values are retrieved. They should be scaled // with the bandPassFactor == 2 - vector<fcomplex> results(runTest<fcomplex>( + vector<fcomplex> results(runTest( defs, 0.0, // sb freq 0U, // beam @@ -342,7 +329,7 @@ TEST(Phase0s) defs["DELAY_COMPENSATION"] = "1"; defs["SUBBAND_BANDWIDTH"] = "1.0"; - vector<fcomplex> results(runTest<fcomplex>( + vector<fcomplex> results(runTest( defs, 1.0, // sb freq 0U, // beam @@ -371,7 +358,7 @@ SUITE(DelayCompensation) defs["DELAY_COMPENSATION"] = "1"; defs["SUBBAND_BANDWIDTH"] = "1.0"; - vector<fcomplex> results(runTest<fcomplex>( + vector<fcomplex> results(runTest( defs, 1.0, // sb freq 0U, // beam @@ -468,7 +455,7 @@ SUITE(DelayCompensation) defs["DELAY_COMPENSATION"] = "1"; defs["SUBBAND_BANDWIDTH"] = "1.0"; - vector<fcomplex> results(runTest<fcomplex>( + vector<fcomplex> results(runTest( defs, 1.0, // sb freq 0U, // beam @@ -549,7 +536,7 @@ TEST(AllAtOnce) defs["DELAY_COMPENSATION"] = "1"; defs["SUBBAND_BANDWIDTH"] = "1.0"; - vector<fcomplex> results(runTest<fcomplex>( + vector<fcomplex> results(runTest( defs, 1.0, // sb freq 0U, // beam