diff --git a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt index 1354c2e361631b2781e94d293f1f2a43aa2528d7..fc638f25c4f72ea52a354950285d27442a668d67 100644 --- a/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt +++ b/RTCP/Cobalt/GPUProc/doc/bf-pipeline.txt @@ -100,7 +100,7 @@ FFT-16 {inplace} (if >1ch) | [station][pol][samples][channel] [48][2][12288][16] = 144 MiB B | V -Incoherent Stokes (*: no transpose) (*** kernel must be fixed: wrong input format ***) +Incoherent Stokes (*: no transpose) | [stokes][samples][channel] [4][12288][16] = 3 MiB E V (float) (output) diff --git a/RTCP/Cobalt/GPUProc/share/gpu/kernels/BandPassCorrection.cu b/RTCP/Cobalt/GPUProc/share/gpu/kernels/BandPassCorrection.cu index 61a95f5aa260ddf99d8472c5ec35037dca2a5052..10f68b42f871e5e049c0cb0f9d7af1f244c41e86 100644 --- a/RTCP/Cobalt/GPUProc/share/gpu/kernels/BandPassCorrection.cu +++ b/RTCP/Cobalt/GPUProc/share/gpu/kernels/BandPassCorrection.cu @@ -34,6 +34,7 @@ * - @c NR_CHANNELS_2: a multiple of 16 * - @c NR_SAMPLES_PER_CHANNEL: > a multiple of 16 * - @c NR_BITS_PER_SAMPLE: 8 or 16 + * - @c DO_BANDPASS_CORRECTION: if defined, perform bandpass correction */ #include "gpu_math.cuh" @@ -69,7 +70,8 @@ typedef const float (* BandPassFactorsType)[NR_CHANNELS_1 * NR_CHANNELS_2]; /** * This kernel performs on the input data: - * - Apply a bandpass correction to compensate for the errors introduced by the + * - If the preprocessor variable \c DO_BANDPASS_CORRECTION is defined, apply a + * bandpass correction to compensate for the errors introduced by the * polyphase filter that produced the subbands. This error is deterministic, * hence it can be fully compensated for. * - Transpose the data so that the samples for each channel are placed @@ -92,9 +94,11 @@ __global__ void bandPassCorrection( fcomplex * outputDataPtr, OutputDataType outputData = (OutputDataType) outputDataPtr; InputDataType inputData = (InputDataType) inputDataPtr; +#if defined DO_BANDPASS_CORRECTION // Band pass to apply to the channels BandPassFactorsType bandPassFactors = (BandPassFactorsType) bandPassFactorsPtr; - +#endif + // fasted dims unsigned chan2 = blockIdx.x * blockDim.x + threadIdx.x; unsigned sample = blockIdx.y * blockDim.y + threadIdx.y; @@ -109,17 +113,18 @@ __global__ void bandPassCorrection( fcomplex * outputDataPtr, for (unsigned idx_channel1 = 0; idx_channel1 < NR_CHANNELS_1; ++idx_channel1) { - // idx_channel1 steps with NR_CHANNELS_2 tru the channel weights - float weight((*bandPassFactors)[idx_channel1 * NR_CHANNELS_2 + chan2]); - // Read from global memory in the quickest dimension (optimal) fcomplex sampleX = (*inputData)[station][0][idx_channel1][sample][chan2]; fcomplex sampleY = (*inputData)[station][1][idx_channel1][sample][chan2]; - + +#if defined DO_BANDPASS_CORRECTION + // idx_channel1 steps with NR_CHANNELS_2 tru the channel weights + float weight((*bandPassFactors)[idx_channel1 * NR_CHANNELS_2 + chan2]); sampleX.x *= weight; sampleX.y *= weight; sampleY.x *= weight; sampleY.y *= weight; +#endif // Write the data to shared memory tmp[threadIdx.y][threadIdx.x][0] = sampleX; diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.cc index 5b72388f988c5fbe4a8313e16013c41ca265dc09..ee638a8ced69b66722b19ba6e6fe6cd25c3adad9 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.cc @@ -48,7 +48,10 @@ namespace LOFAR Kernel::Parameters(ps), nrBitsPerSample(ps.settings.nrBitsPerSample), nrBytesPerComplexSample(ps.nrBytesPerComplexSample()), - nrSAPs(ps.settings.SAPs.size()) + nrSAPs(ps.settings.SAPs.size()), + nrChannels1(64), // TODO: Must be read from parset? + nrChannels2(64), // TODO: Must be read from parset? + correctBandPass(ps.settings.corrections.bandPass) { dumpBuffers = ps.getBool("Cobalt.Kernels.BandPassCorrectionKernel.dumpOutput", false); @@ -77,7 +80,7 @@ namespace LOFAR nrBytesRead = nrBytesWritten = nrSamples * sizeof(std::complex<float>); gpu::HostMemory bpWeights(stream.getContext(), buffers.bandPassCorrectionWeights.size()); - BandPass::computeCorrectionFactors(bpWeights.get<float>(), params.nrChannelsPerSubband); + BandPass::computeCorrectionFactors(bpWeights.get<float>(), params.nrChannels1 * params.nrChannels2); stream.writeBuffer(buffers.bandPassCorrectionWeights, bpWeights, true); } @@ -125,6 +128,8 @@ namespace LOFAR lexical_cast<string>(itsParameters.nrChannels1); defs["NR_CHANNELS_2"] = lexical_cast<string>(itsParameters.nrChannels2); + if (itsParameters.correctBandPass) + defs["DO_BANDPASS_CORRECTION"] = "1"; return defs; } } diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.h index 2e8dbf86b59b842939f1dd6c6f2e8597ff231f75..556ba7564428e10638ff65d20c462ad59abeee2d 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/BandPassCorrectionKernel.h @@ -48,6 +48,7 @@ namespace LOFAR size_t nrSAPs; size_t nrChannels1; size_t nrChannels2; + bool correctBandPass; }; enum BufferType diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc index ad05e85795463acd3ada4f5b87e731534cd83051..b419476869311631f36644a477c13d8cf9061e35 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/DelayAndBandPassKernel.cc @@ -70,6 +70,13 @@ namespace LOFAR const Parameters& params) : Kernel(stream, gpu::Function(module, theirFunction), buffers, params) { + LOG_DEBUG_STR("DelayAndBandPassKernel: " << + "correctBandPass=" << + (params.correctBandPass ? "true" : "false") << + ", delayCompensation=" << + (params.delayCompensation ? "true" : "false") << + ", transpose=" << (params.transpose ? "true" : "false")); + ASSERT(params.nrChannelsPerSubband % 16 == 0 || params.nrChannelsPerSubband == 1); ASSERT(params.nrSamplesPerChannel % 16 == 0); diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/FFT_Kernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/FFT_Kernel.cc index 1f7b6362f38f229c336fdafa83231221ce34d0b6..59f1d71013778caf12fc66033b22ad197e425529 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/FFT_Kernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/FFT_Kernel.cc @@ -46,6 +46,10 @@ namespace LOFAR buffer(buffer), itsStream(stream) { + LOG_DEBUG_STR("FFT_Kernel: " << + "fftSize=" << fftSize << + ", direction=" << (forward ? "forward" : "inverse") << + ", nrFFTs=" << nrFFTs); } void FFT_Kernel::enqueue(const BlockID &blockId, diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc index 6e217c691c35805472b27d7394797b2b12500b7f..f4ad933533cdb9c0cd11221228ed4d0466c91e64 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc @@ -58,7 +58,8 @@ namespace LOFAR const Parameters ¶ms) : gpu::Function(function), - maxThreadsPerBlock(stream.getContext().getDevice().getMaxThreadsPerBlock()), + maxThreadsPerBlock( + stream.getContext().getDevice().getMaxThreadsPerBlock()), itsStream(stream), itsBuffers(buffers), itsParameters(params) @@ -71,27 +72,37 @@ namespace LOFAR function.getAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS)); } - void Kernel::setEnqueueWorkSizes(gpu::Grid globalWorkSize, gpu::Block localWorkSize) + void Kernel::setEnqueueWorkSizes(gpu::Grid globalWorkSize, + gpu::Block localWorkSize) { gpu::Grid grid; ostringstream errMsgs; - // Enforce by the hardware supported work sizes to see errors clearly and early. + // Enforce by the hardware supported work sizes to see errors clearly and + // early. - gpu::Block maxLocalWorkSize = itsStream.getContext().getDevice().getMaxBlockDims(); + gpu::Block maxLocalWorkSize = + itsStream.getContext().getDevice().getMaxBlockDims(); if (localWorkSize.x > maxLocalWorkSize.x || localWorkSize.y > maxLocalWorkSize.y || localWorkSize.z > maxLocalWorkSize.z) - errMsgs << " - localWorkSize must be at most " << maxLocalWorkSize << endl; - - if (localWorkSize.x * localWorkSize.y * localWorkSize.z > maxThreadsPerBlock) - errMsgs << " - localWorkSize total must be at most " << maxThreadsPerBlock << " threads/block" << endl; - - // globalWorkSize may (in theory) be all zero (no work). Reject such localWorkSize. - if (localWorkSize.x == 0 || localWorkSize.y == 0 || localWorkSize.z == 0) { + errMsgs << " - localWorkSize must be at most " << maxLocalWorkSize + << endl; + + if (localWorkSize.x * localWorkSize.y * localWorkSize.z > + maxThreadsPerBlock) + errMsgs << " - localWorkSize total must be at most " + << maxThreadsPerBlock << " threads/block" << endl; + + // globalWorkSize may (in theory) be all zero (no work). Reject such + // localWorkSize. + if (localWorkSize.x == 0 || + localWorkSize.y == 0 || + localWorkSize.z == 0) { errMsgs << " - localWorkSize components must be non-zero" << endl; } else { - // TODO: to globalWorkSize in terms of localWorkSize (CUDA) ('gridWorkSize'). + // TODO: to globalWorkSize in terms of localWorkSize (CUDA) + // ('gridWorkSize'). if (globalWorkSize.x % localWorkSize.x != 0 || globalWorkSize.y % localWorkSize.y != 0 || globalWorkSize.z % localWorkSize.z != 0) @@ -100,19 +111,21 @@ namespace LOFAR globalWorkSize.y / localWorkSize.y, globalWorkSize.z / localWorkSize.z); - gpu::Grid maxGridWorkSize = itsStream.getContext().getDevice().getMaxGridDims(); + gpu::Grid maxGridWorkSize = + itsStream.getContext().getDevice().getMaxGridDims(); if (grid.x > maxGridWorkSize.x || grid.y > maxGridWorkSize.y || grid.z > maxGridWorkSize.z) - errMsgs << " - globalWorkSize / localWorkSize must be at most " << maxGridWorkSize << endl; + errMsgs << " - globalWorkSize / localWorkSize must be at most " + << maxGridWorkSize << endl; } - string errStr(errMsgs.str()); if (!errStr.empty()) - THROW(gpu::GPUException, "setEnqueueWorkSizes(): unsupported globalWorkSize " << - globalWorkSize << " and/or localWorkSize " << localWorkSize << " selected:" << - endl << errStr); + THROW(gpu::GPUException, + "setEnqueueWorkSizes(): unsupported globalWorkSize " << + globalWorkSize << " and/or localWorkSize " << localWorkSize << + " selected:" << endl << errStr); LOG_DEBUG_STR("CUDA Grid size: " << grid); LOG_DEBUG_STR("CUDA Block size: " << localWorkSize); diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h index 701842f84bbe0deb33b3fb683f25597ee8ea5951..591f8a7f333e1f6d98373ead29a5523a688e2e17 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h @@ -41,7 +41,9 @@ namespace LOFAR { public: // Parameters that must be passed to the constructor of this Kernel class. - // TODO: more at constructor passed immediates can be turned into defines (blockDim/gridDim too if enforced fixed (consider conditional define) or drop opt) + // TODO: more at constructor passed immediates can be turned into defines + // (blockDim/gridDim too if enforced fixed (consider conditional define) + // or drop opt) struct Parameters { Parameters(const Parset& ps); @@ -85,7 +87,8 @@ namespace LOFAR // Explicit destructor, because the implicitly generated one is public. ~Kernel(); - void setEnqueueWorkSizes(gpu::Grid globalWorkSize, gpu::Block localWorkSize); + void setEnqueueWorkSizes(gpu::Grid globalWorkSize, + gpu::Block localWorkSize); const unsigned maxThreadsPerBlock; diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/BeamFormerPipeline.cc b/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/BeamFormerPipeline.cc index 6b12ced4112103f2774144c877da897bdbef49a5..e514450fe4cf0034c533247d1eea7dcfeff0426a 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/BeamFormerPipeline.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Pipelines/BeamFormerPipeline.cc @@ -112,7 +112,6 @@ namespace LOFAR samples += proc->counters.samples.stats; visibilities += proc->counters.visibilities.stats; - copyBuffers += proc->counters.copyBuffers.stats; incoherentOutput += proc->counters.incoherentOutput.stats; } diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc index 63381eaa60305dd5e999e77a9d1835d4daa6176d..45532308ec28f281ec4a75ca07e1091d519a1fe0 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.cc @@ -32,15 +32,15 @@ namespace LOFAR size_t nrSubbandsPerSubbandProc) : intToFloat(ps), delayCompensation(delayCompensationParams(ps)), - correctBandPass(correctBandPassParams(ps)), beamFormer(beamFormerParams(ps)), transpose(transposeParams(ps)), firFilter(firFilterParams(ps, nrSubbandsPerSubbandProc)), coherentStokes(coherentStokesParams(ps)), incoherentStokes(incoherentStokesParams(ps)), - incoherentFirFilter(incoherentFirFilterParams(ps, nrSubbandsPerSubbandProc)), + incoherentStokesTranspose(incoherentStokesTransposeParams(ps)), + incoherentFirFilter( + incoherentFirFilterParams(ps, nrSubbandsPerSubbandProc)), bandPassCorrection(bandPassCorrectionParams(ps)) - { } @@ -74,21 +74,6 @@ namespace LOFAR return params; } - DelayAndBandPassKernel::Parameters - BeamFormerFactories::correctBandPassParams(const Parset &ps) const - { - DelayAndBandPassKernel::Parameters params(ps); - params.nrChannelsPerSubband = - BeamFormerSubbandProc::BEAM_FORMER_NR_CHANNELS; - params.nrSamplesPerChannel = - ps.nrSamplesPerSubband() / - BeamFormerSubbandProc::BEAM_FORMER_NR_CHANNELS; - params.delayCompensation = false; - params.transpose = true; - - return params; - } - BeamFormerKernel::Parameters BeamFormerFactories::beamFormerParams(const Parset &ps) const { @@ -158,15 +143,15 @@ namespace LOFAR { FIR_FilterKernel::Parameters params(ps); - params.nrSTABs = ps.settings.beamFormer.maxNrTABsPerSAP(); + params.nrSTABs = ps.nrStations(); - // define at least 16 channels to get the FIR_Filter.cu to compile, even - // if we won't use it. - params.nrChannelsPerSubband = std::max(16U, - ps.settings.beamFormer.incoherentSettings.nrChannels); + params.nrChannelsPerSubband = + ps.settings.beamFormer.incoherentSettings.nrChannels; - // time integration has not taken place yet, so calculate the nrSamples manually - params.nrSamplesPerChannel = ps.nrSamplesPerSubband() / params.nrChannelsPerSubband; + // time integration has not taken place yet, so calculate the nrSamples + // manually + params.nrSamplesPerChannel = + ps.nrSamplesPerSubband() / params.nrChannelsPerSubband; params.nrSubbands = nrSubbandsPerSubbandProc; @@ -178,12 +163,26 @@ namespace LOFAR incoherentStokesParams(const Parset &ps) const { IncoherentStokesKernel::Parameters params(ps); - //TODO: beamformer params - params.nrChannelsPerSubband = ps.settings.beamFormer.incoherentSettings.nrChannels; - params.nrSamplesPerChannel = ps.nrSamplesPerSubband() / params.nrChannelsPerSubband; + params.nrChannelsPerSubband = + ps.settings.beamFormer.incoherentSettings.nrChannels; + params.nrSamplesPerChannel = + ps.nrSamplesPerSubband() / params.nrChannelsPerSubband; return params; } + IncoherentStokesTransposeKernel::Parameters + BeamFormerFactories:: + incoherentStokesTransposeParams(const Parset &ps) const + { + IncoherentStokesTransposeKernel::Parameters params(ps); + params.nrChannelsPerSubband = + BeamFormerSubbandProc::BEAM_FORMER_NR_CHANNELS; + params.nrSamplesPerChannel = + ps.nrSamplesPerSubband() / + BeamFormerSubbandProc::BEAM_FORMER_NR_CHANNELS; + + return params; + } } } diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h index 813f52d356b040c1fcac16e4dfc890bcc4ce4f74..ccc2875c261f1bbc131032dbbae1b6ed076be313 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerFactories.h @@ -23,6 +23,7 @@ #define LOFAR_GPUPROC_CUDA_BEAM_FORMER_FACTORIES_H #include <GPUProc/KernelFactory.h> +#include <GPUProc/Kernels/BandPassCorrectionKernel.h> #include <GPUProc/Kernels/BeamFormerKernel.h> #include <GPUProc/Kernels/BeamFormerTransposeKernel.h> #include <GPUProc/Kernels/CoherentStokesKernel.h> @@ -30,7 +31,7 @@ #include <GPUProc/Kernels/FIR_FilterKernel.h> #include <GPUProc/Kernels/IntToFloatKernel.h> #include <GPUProc/Kernels/IncoherentStokesKernel.h> -#include <GPUProc/Kernels/BandPassCorrectionKernel.h> +#include <GPUProc/Kernels/IncoherentStokesTransposeKernel.h> namespace LOFAR { @@ -46,12 +47,12 @@ namespace LOFAR KernelFactory<IntToFloatKernel> intToFloat; KernelFactory<DelayAndBandPassKernel> delayCompensation; - KernelFactory<DelayAndBandPassKernel> correctBandPass; KernelFactory<BeamFormerKernel> beamFormer; KernelFactory<BeamFormerTransposeKernel> transpose; KernelFactory<FIR_FilterKernel> firFilter; KernelFactory<CoherentStokesKernel> coherentStokes; KernelFactory<IncoherentStokesKernel> incoherentStokes; + KernelFactory<IncoherentStokesTransposeKernel> incoherentStokesTranspose; KernelFactory<FIR_FilterKernel> incoherentFirFilter; KernelFactory<BandPassCorrectionKernel> bandPassCorrection; @@ -67,9 +68,6 @@ namespace LOFAR CoherentStokesKernel::Parameters coherentStokesParams(const Parset &ps) const; - DelayAndBandPassKernel::Parameters - correctBandPassParams(const Parset &ps) const; - DelayAndBandPassKernel::Parameters delayCompensationParams(const Parset &ps) const; @@ -83,6 +81,9 @@ namespace LOFAR IncoherentStokesKernel::Parameters incoherentStokesParams(const Parset &ps) const; + IncoherentStokesTransposeKernel::Parameters + incoherentStokesTransposeParams(const Parset &ps) const; + }; diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc index 32d0d04e09ec5c8928bc4bf688312fab3676f226..d9f0baaab2a05890d3c63990afe86b0e1b4f7b7b 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.cc @@ -25,6 +25,7 @@ #include <GPUProc/global_defines.h> #include <GPUProc/gpu_wrapper.h> +#include <GPUProc/gpu_utils.h> // for dumpBuffer() #include <CoInterface/Parset.h> #include <ApplCommon/PosixTime.h> @@ -70,19 +71,21 @@ namespace LOFAR // FIR, which the beam former does in a later stage! devInput( std::max( - factories.intToFloat.bufferSize(IntToFloatKernel::INPUT_DATA), + factories.intToFloat.bufferSize(IntToFloatKernel::OUTPUT_DATA), factories.beamFormer.bufferSize(BeamFormerKernel::OUTPUT_DATA)), factories.delayCompensation.bufferSize( DelayAndBandPassKernel::DELAYS), factories.delayCompensation.bufferSize( DelayAndBandPassKernel::PHASE_OFFSETS), - context), + context), // coherent stokes buffers devA(devInput.inputSamples), devB(context, devA.size()), // Buffers for incoherent stokes devC(context, devA.size()), devD(context, devA.size()), + devE(context, factories.incoherentStokes.bufferSize( + IncoherentStokesKernel::OUTPUT_DATA)), devNull(context, 1), // intToFloat: input -> B @@ -113,15 +116,19 @@ namespace LOFAR // bandPass: A -> B devBandPassCorrectionWeights( context, - factories.correctBandPass.bufferSize( - DelayAndBandPassKernel::BAND_PASS_CORRECTION_WEIGHTS)), - correctBandPassBuffers( - devA, devB, devNull, devNull, devNull, devBandPassCorrectionWeights), - correctBandPassKernel( - factories.correctBandPass.create(queue, correctBandPassBuffers)), + factories.bandPassCorrection.bufferSize( + BandPassCorrectionKernel::BAND_PASS_CORRECTION_WEIGHTS)), + bandPassCorrectionBuffers( + devA, devB, devBandPassCorrectionWeights), + bandPassCorrectionKernel( + factories.bandPassCorrection.create(queue, bandPassCorrectionBuffers)), //************************************************************** //coherent stokes + outputComplexVoltages( + ps.settings.beamFormer.coherentSettings.type == STOKES_XXYY), + coherentStokesPPF(ps.settings.beamFormer.coherentSettings.nrChannels > 1), + // beamForm: B -> A // TODO: support >1 SAP devBeamFormerDelays( @@ -130,89 +137,128 @@ namespace LOFAR beamFormerBuffers(devB, devA, devBeamFormerDelays), beamFormerKernel(factories.beamFormer.create(queue, beamFormerBuffers)), - // transpose after beamforming: A -> B - transposeBuffers(devA, devB), + // transpose after beamforming: A -> C/D + // + // Output buffer: + // 1ch: CS: C, CV: D + // PPF: CS: D, CV: C + transposeBuffers( + devA, outputComplexVoltages ^ coherentStokesPPF ? devD : devC), transposeKernel(factories.transpose.create(queue, transposeBuffers)), - // inverse FFT: B -> B + // inverse FFT: C/D -> C/D (in-place) = transposeBuffers.output inverseFFT( queue, BEAM_FORMER_NR_CHANNELS, (ps.settings.beamFormer.maxNrTABsPerSAP() * NR_POLARIZATIONS * ps.nrSamplesPerSubband() / BEAM_FORMER_NR_CHANNELS), - false, devB), - - // FIR filter: B -> A + false, transposeBuffers.output), + + // FIR filter: D/C -> C/D + // + // Input buffer: + // 1ch: CS: -, CV: - (no FIR will be done) + // PPF: CS: D, CV: C = transposeBuffers.output + // + // Output buffer: + // 1ch: CS: -, CV: - (no FIR will be done) + // PPF: CS: C, CV: D = transposeBuffers.input devFilterWeights( context, factories.firFilter.bufferSize(FIR_FilterKernel::FILTER_WEIGHTS)), devFilterHistoryData( context, factories.firFilter.bufferSize(FIR_FilterKernel::HISTORY_DATA)), - firFilterBuffers(devB, devA, devFilterWeights, devFilterHistoryData), + firFilterBuffers( + transposeBuffers.output, transposeBuffers.input, + devFilterWeights, devFilterHistoryData), firFilterKernel(factories.firFilter.create(queue, firFilterBuffers)), - // final FFT: A -> A + // final FFT: C/D -> C/D (in-place) = firFilterBuffers.output finalFFT( queue, ps.settings.beamFormer.coherentSettings.nrChannels, (ps.settings.beamFormer.maxNrTABsPerSAP() * NR_POLARIZATIONS * ps.nrSamplesPerSubband() / ps.settings.beamFormer.coherentSettings.nrChannels), - true, devA), + true, firFilterBuffers.output), - // coherentStokes: 1ch: A -> B, Nch: B -> A + // coherentStokes: C -> D + // + // 1ch: input comes from inverseFFT in C + // Nch: input comes from finalFFT in C coherentStokesBuffers( - ps.settings.beamFormer.coherentSettings.nrChannels > 1 ? devA : devB, - ps.settings.beamFormer.coherentSettings.nrChannels > 1 ? devB : devA), + devC, + devD), coherentStokesKernel( factories.coherentStokes.create(queue, coherentStokesBuffers)), - // result buffer - devResult(ps.settings.beamFormer.coherentSettings.nrChannels > 1 ? devB : devA), //************************************************************** //incoherent stokes - // TODO: Add a transpose - // inverse FFT: C -> C - incoherentInverseFFT(queue, BEAM_FORMER_NR_CHANNELS, - NR_POLARIZATIONS * - ps.nrSamplesPerSubband() / BEAM_FORMER_NR_CHANNELS, false, devC), - - // FIR filter: C -> D - // TODO: provide history samples separately - // TODO: do a FIR for each individual TAB!! - devIncoherentFilterWeights(context, - factories.incoherentFirFilter.bufferSize(FIR_FilterKernel::FILTER_WEIGHTS)), - devIncoherentFilterHistoryData(context, - factories.incoherentFirFilter.bufferSize(FIR_FilterKernel::HISTORY_DATA)), - incoherentFirFilterBuffers(devC, devD, - devIncoherentFilterWeights, devIncoherentFilterHistoryData), - incoherentFirFilterKernel( - factories.incoherentFirFilter.create( - queue, incoherentFirFilterBuffers)), + incoherentStokesPPF( + ps.settings.beamFormer.incoherentSettings.nrChannels > 1), - // final FFT: D -> D - incoherentFinalFFT(queue, ps.settings.beamFormer.incoherentSettings.nrChannels, - NR_POLARIZATIONS * ps.nrSamplesPerSubband() / - ps.settings.beamFormer.incoherentSettings.nrChannels, true, devD), + // Transpose: B -> A + incoherentTransposeBuffers(devB, devA), - // incoherentstokes kernel + incoherentTranspose( + factories.incoherentStokesTranspose.create( + queue, incoherentTransposeBuffers)), + + // inverse FFT: A -> A + incoherentInverseFFT( + queue, BEAM_FORMER_NR_CHANNELS, + (ps.nrStations() * NR_POLARIZATIONS * + ps.nrSamplesPerSubband() / BEAM_FORMER_NR_CHANNELS), + false, devA), + + // FIR filter: A -> B + devIncoherentFilterWeights( + context, + factories.incoherentFirFilter.bufferSize( + FIR_FilterKernel::FILTER_WEIGHTS)), + + devIncoherentFilterHistoryData( + context, + factories.incoherentFirFilter.bufferSize( + FIR_FilterKernel::HISTORY_DATA)), + + incoherentFirFilterBuffers( + devA, devB, devIncoherentFilterWeights, devIncoherentFilterHistoryData), + + incoherentFirFilterKernel( + factories.incoherentFirFilter.create( + queue, incoherentFirFilterBuffers)), + + // final FFT: B -> B + incoherentFinalFFT( + queue, ps.settings.beamFormer.incoherentSettings.nrChannels, + (ps.nrStations() * NR_POLARIZATIONS * ps.nrSamplesPerSubband() / + ps.settings.beamFormer.incoherentSettings.nrChannels), + true, devB), + + // incoherentstokes kernel: A/B -> E + // + // 1ch: input comes from incoherentInverseFFT in A + // Nch: input comes from incoherentFinalFFT in B incoherentStokesBuffers( - ps.settings.beamFormer.incoherentSettings.nrChannels > 1 ? devD : devC, - ps.settings.beamFormer.incoherentSettings.nrChannels > 1 ? devC : devD), - incoherentStokesKernel( - factories.incoherentStokes.create(queue, incoherentStokesBuffers)), + incoherentStokesPPF ? devB : devA, + devE), - devIncoherentStokes(ps.settings.beamFormer.incoherentSettings.nrChannels > 1 ? devC : devD) + incoherentStokesKernel( + factories.incoherentStokes.create(queue, incoherentStokesBuffers)) { - // initialize history data + // initialize history data for both coherent and incoherent stokes. devFilterHistoryData.set(0); + devIncoherentFilterHistoryData.set(0); // TODO For now we only allow pure coherent and incoherent runs // count the number of coherent and incoherent saps size_t nrCoherent = 0; size_t nrIncoherent = 0; - for (size_t idx_sap = 0; idx_sap < ps.settings.beamFormer.SAPs.size(); ++idx_sap) + for (size_t idx_sap = 0; + idx_sap < ps.settings.beamFormer.SAPs.size(); + ++idx_sap) { if (ps.settings.beamFormer.SAPs[idx_sap].nrIncoherent != 0) nrIncoherent++; @@ -223,12 +269,17 @@ namespace LOFAR // raise exception if the parset contained an incorrect configuration if (nrCoherent != 0 && nrIncoherent != 0) THROW(GPUProcException, - "Parset contained both incoherent and coherent stokes SAPS. This is not supported"); + "Parset contained both incoherent and coherent stokes SAPS. " + "This is not supported"); if (nrCoherent) coherentBeamformer = true; else coherentBeamformer = false; + + LOG_INFO_STR("Running " + << (coherentBeamformer ? "a coherent" : "an incoherent") + << " Stokes beamformer pipeline"); // put enough objects in the outputPool to operate for (size_t i = 0; i < nrOutputElements(); ++i) @@ -240,16 +291,18 @@ namespace LOFAR new BeamFormedData( (ps.settings.beamFormer.maxNrTABsPerSAP() * ps.settings.beamFormer.coherentSettings.nrStokes), - ps.settings.beamFormer.coherentSettings.nrChannels, + ps.settings.beamFormer.coherentSettings.nrChannels, ps.settings.beamFormer.coherentSettings.nrSamples( ps.nrSamplesPerSubband()), - context)); + context)); else - outputPool.free.append(new BeamFormedData( - ps.settings.beamFormer.incoherentSettings.nrStokes, - ps.settings.beamFormer.incoherentSettings.nrChannels, - ps.settings.beamFormer.incoherentSettings.nrSamples(ps.nrSamplesPerSubband()), - context)); + outputPool.free.append( + new BeamFormedData( + ps.settings.beamFormer.incoherentSettings.nrStokes, + ps.settings.beamFormer.incoherentSettings.nrChannels, + ps.settings.beamFormer.incoherentSettings.nrSamples( + ps.nrSamplesPerSubband()), + context)); } //// CPU timers are set by CorrelatorPipeline @@ -283,25 +336,23 @@ namespace LOFAR incoherentFirFilterKernel(context), incoherentFinalFFT(context), incoherentStokes(context), + incoherentStokesTranspose(context), samples(context), visibilities(context), - copyBuffers(context), incoherentOutput(context) { } void BeamFormerSubbandProc::Counters::printStats() { - // Print the individual counter stats: mean and stDev - LOG_INFO_STR("**** BeamFormerSubbandProc GPU mean and stDev ****" << endl << + LOG_INFO_STR( + "**** BeamFormerSubbandProc GPU mean and stDev ****" << endl << std::setw(20) << "(intToFloat)" << intToFloat.stats << endl << std::setw(20) << "(firstFFT)" << firstFFT.stats << endl << std::setw(20) << "(delayBp)" << delayBp.stats << endl << std::setw(20) << "(secondFFT)" << secondFFT.stats << endl << - std::setw(20) << "(correctBandpass)" << correctBandpass.stats << endl << - - + std::setw(20) << "(correctBandpass)" << correctBandpass.stats << endl << std::setw(20) << "(beamformer)" << beamformer.stats << endl << std::setw(20) << "(transpose)" << transpose.stats << endl << std::setw(20) << "(inverseFFT)" << inverseFFT.stats << endl << @@ -310,21 +361,19 @@ namespace LOFAR std::setw(20) << "(coherentStokes)" << coherentStokes.stats << endl << std::setw(20) << "(samples)" << samples.stats << endl << std::setw(20) << "(visibilities)" << visibilities.stats << endl << - - std::setw(20) << "(copyBuffers)" << copyBuffers.stats << endl << std::setw(20) << "(incoherentOutput )" << incoherentOutput.stats << endl << std::setw(20) << "(incoherentInverseFFT)" << incoherentInverseFFT.stats << endl << std::setw(20) << "(incoherentFirFilterKernel)" << incoherentFirFilterKernel.stats << endl << std::setw(20) << "(incoherentFinalFFT)" << incoherentFinalFFT.stats << endl << - std::setw(20) << "(incoherentStokes)" << incoherentStokes.stats << endl ); - + std::setw(20) << "(incoherentStokes)" << incoherentStokes.stats << endl << + std::setw(20) << "(incoherentStokesTranspose)" << incoherentStokesTranspose.stats << endl); } void BeamFormerSubbandProc::processSubband(SubbandProcInputData &input, StreamableData &_output) { BeamFormedData &output = dynamic_cast<BeamFormedData&>(_output); - BeamFormedData &incoherentOutput = static_cast<BeamFormedData&>(_output); + BeamFormedData &incoherentOutput = dynamic_cast<BeamFormedData&>(_output); size_t block = input.blockID.block; unsigned subband = input.blockID.globalSubbandIdx; @@ -358,19 +407,22 @@ namespace LOFAR intToFloatKernel->enqueue(input.blockID, counters.intToFloat); firstFFT.enqueue(input.blockID, counters.firstFFT); + dumpBuffer(devB, "firstFFT.output.dat"); + delayCompensationKernel->enqueue( input.blockID, counters.delayBp, ps.settings.subbands[subband].centralFrequency, ps.settings.subbands[subband].SAP); + dumpBuffer(delayCompensationBuffers.output, + "delayCompensation.output.dat"); secondFFT.enqueue(input.blockID, counters.secondFFT); - correctBandPassKernel->enqueue( - input.blockID, counters.correctBandpass, - ps.settings.subbands[subband].centralFrequency, - ps.settings.subbands[subband].SAP); + dumpBuffer(devA, "secondFFT.output.dat"); - // TODO: To allow the copy of data to new buffer we need a sync here? - queue.copyBuffer(devC, devB, counters.copyBuffers, true); + bandPassCorrectionKernel->enqueue( + input.blockID, counters.correctBandpass); + dumpBuffer(bandPassCorrectionBuffers.output, + "bandPassCorrection.output.dat"); // ******************************************************************** // coherent stokes kernels @@ -384,40 +436,59 @@ namespace LOFAR inverseFFT.enqueue(input.blockID, counters.inverseFFT); - if (ps.settings.beamFormer.coherentSettings.nrChannels > 1) + if (coherentStokesPPF) { firFilterKernel->enqueue(input.blockID, counters.firFilterKernel, input.blockID.subbandProcSubbandIdx); finalFFT.enqueue(input.blockID, counters.finalFFT); } - - coherentStokesKernel->enqueue(input.blockID, counters.coherentStokes); + + if (!outputComplexVoltages) + { + coherentStokesKernel->enqueue(input.blockID, counters.coherentStokes); + } } else { // ******************************************************************** // incoherent stokes kernels - incoherentInverseFFT.enqueue(input.blockID, counters.incoherentInverseFFT); + incoherentTranspose->enqueue( + input.blockID, counters.incoherentStokesTranspose); - if (ps.settings.beamFormer.incoherentSettings.nrChannels > 1) + incoherentInverseFFT.enqueue( + input.blockID, counters.incoherentInverseFFT); + dumpBuffer(devA, "inverseFFT.output.dat"); + + if (incoherentStokesPPF) { - incoherentFirFilterKernel->enqueue(input.blockID, counters.incoherentFirFilterKernel, + incoherentFirFilterKernel->enqueue( + input.blockID, counters.incoherentFirFilterKernel, input.blockID.subbandProcSubbandIdx); - incoherentFinalFFT.enqueue(input.blockID, counters.incoherentFinalFFT); + + incoherentFinalFFT.enqueue( + input.blockID, counters.incoherentFinalFFT); + dumpBuffer(devB, "finalFFT.output.dat"); } - incoherentStokesKernel->enqueue(input.blockID, counters.incoherentStokes); + incoherentStokesKernel->enqueue( + input.blockID, counters.incoherentStokes); + // TODO: Propagate flags } + queue.synchronize(); + // Output in devD and devE, by design. if (coherentBeamformer) - queue.readBuffer(output, devResult, counters.visibilities, true); + queue.readBuffer( + output, devD, counters.visibilities, true); else - queue.readBuffer(incoherentOutput, devIncoherentStokes, counters.incoherentOutput, true); + queue.readBuffer( + incoherentOutput, devE, + counters.incoherentOutput, true); - // ************************************************ + // ************************************************ // Perform performance statistics if needed if (gpuProfiling) { @@ -431,10 +502,9 @@ namespace LOFAR counters.correctBandpass.logTime(); counters.samples.logTime(); - counters.copyBuffers.logTime(); if (coherentBeamformer) { - if (ps.settings.beamFormer.coherentSettings.nrChannels > 1) + if (coherentStokesPPF) { counters.firFilterKernel.logTime(); counters.finalFFT.logTime(); @@ -443,13 +513,19 @@ namespace LOFAR counters.beamformer.logTime(); counters.transpose.logTime(); counters.inverseFFT.logTime(); - counters.coherentStokes.logTime(); + + if (!outputComplexVoltages) + { + counters.coherentStokes.logTime(); + } + counters.visibilities.logTime(); } else { + counters.incoherentStokesTranspose.logTime(); counters.incoherentInverseFFT.logTime(); - if (ps.settings.beamFormer.incoherentSettings.nrChannels > 1) + if (incoherentStokesPPF) { counters.incoherentFirFilterKernel.logTime(); counters.incoherentFinalFFT.logTime(); diff --git a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h index 5a3a4e7c2be1b37a1f1c312cd0e2132b793fde1c..58f113bca48e9724c294008de5a5dd107a33d680 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/SubbandProcs/BeamFormerSubbandProc.h @@ -35,10 +35,12 @@ #include <GPUProc/Kernels/IntToFloatKernel.h> #include <GPUProc/Kernels/FFT_Kernel.h> #include <GPUProc/Kernels/DelayAndBandPassKernel.h> +#include <GPUProc/Kernels/BandPassCorrectionKernel.h> #include <GPUProc/Kernels/BeamFormerKernel.h> #include <GPUProc/Kernels/BeamFormerTransposeKernel.h> #include <GPUProc/Kernels/CoherentStokesKernel.h> #include <GPUProc/Kernels/IncoherentStokesKernel.h> +#include <GPUProc/Kernels/IncoherentStokesTransposeKernel.h> #include <GPUProc/Kernels/FIR_FilterKernel.h> #include "SubbandProc.h" @@ -105,12 +107,12 @@ namespace LOFAR PerformanceCounter incoherentFirFilterKernel; PerformanceCounter incoherentFinalFFT; PerformanceCounter incoherentStokes; + PerformanceCounter incoherentStokesTranspose; // gpu transfer counters PerformanceCounter samples; PerformanceCounter visibilities; - PerformanceCounter copyBuffers; PerformanceCounter incoherentOutput; // Print the mean and std of each performance counter on the logger void printStats(); @@ -129,10 +131,14 @@ namespace LOFAR // in the InputData class SubbandProcInputData::DeviceBuffers devInput; + // @{ + // Device memory buffers. These buffers are used interleaved. gpu::DeviceMemory devA; gpu::DeviceMemory devB; gpu::DeviceMemory devC; - gpu::DeviceMemory devD; + gpu::DeviceMemory devD; + gpu::DeviceMemory devE; + // @} // NULL placeholder for unused DeviceMemory parameters gpu::DeviceMemory devNull; @@ -141,39 +147,44 @@ namespace LOFAR * Kernels */ - // int -> float + // Int -> Float conversion IntToFloatKernel::Buffers intToFloatBuffers; std::auto_ptr<IntToFloatKernel> intToFloatKernel; - // first FFT + // First (64 points) FFT FFT_Kernel firstFFT; - // delay compensation + // Delay compensation DelayAndBandPassKernel::Buffers delayCompensationBuffers; std::auto_ptr<DelayAndBandPassKernel> delayCompensationKernel; - // second FFT + // Second (64 points) FFT FFT_Kernel secondFFT; - // bandpass correction + // Bandpass correction and tranpose gpu::DeviceMemory devBandPassCorrectionWeights; - DelayAndBandPassKernel::Buffers correctBandPassBuffers; - std::auto_ptr<DelayAndBandPassKernel> correctBandPassKernel; + BandPassCorrectionKernel::Buffers bandPassCorrectionBuffers; + std::auto_ptr<BandPassCorrectionKernel> bandPassCorrectionKernel; // ***************************************************************** // Objects needed to produce Coherent stokes output // beam former + + const bool outputComplexVoltages; + const bool coherentStokesPPF; + gpu::DeviceMemory devBeamFormerDelays; BeamFormerKernel::Buffers beamFormerBuffers; std::auto_ptr<BeamFormerKernel> beamFormerKernel; + // Transpose BeamFormerTransposeKernel::Buffers transposeBuffers; std::auto_ptr<BeamFormerTransposeKernel> transposeKernel; - // inverse FFT + // inverse (4k points) FFT FFT_Kernel inverseFFT; - // PPF + // Poly-phase filter (FIR + FFT) gpu::DeviceMemory devFilterWeights; gpu::DeviceMemory devFilterHistoryData; FIR_FilterKernel::Buffers firFilterBuffers; @@ -184,32 +195,34 @@ namespace LOFAR CoherentStokesKernel::Buffers coherentStokesBuffers; std::auto_ptr<CoherentStokesKernel> coherentStokesKernel; - // end result - gpu::DeviceMemory &devResult; - // ***************************************************************** // Objects needed to produce incoherent stokes output + + const bool incoherentStokesPPF; + + // Transpose + IncoherentStokesTransposeKernel::Buffers incoherentTransposeBuffers; + std::auto_ptr<IncoherentStokesTransposeKernel> incoherentTranspose; + + // Inverse (4k points) FFT FFT_Kernel incoherentInverseFFT; - //// PPF + // Poly-phase filter (FIR + FFT) gpu::DeviceMemory devIncoherentFilterWeights; gpu::DeviceMemory devIncoherentFilterHistoryData; FIR_FilterKernel::Buffers incoherentFirFilterBuffers; std::auto_ptr<FIR_FilterKernel> incoherentFirFilterKernel; FFT_Kernel incoherentFinalFFT; - //// Incoherent Stokes + // Incoherent Stokes IncoherentStokesKernel::Buffers incoherentStokesBuffers; std::auto_ptr<IncoherentStokesKernel> incoherentStokesKernel; - //output for Incoherent stokes - gpu::DeviceMemory &devIncoherentStokes; - bool coherentBeamformer; // TODO temporary hack to allow typing of subband proc }; } - } +} #endif diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc index a19670dc636714ee00c0a21bef5e6de2a8d99b43..316660401c922fce92913dc402183d2e82bc5370 100644 --- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc +++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.cc @@ -1,4 +1,5 @@ -//# tCorrelatorWorKQueueProcessSb.cc: test CorrelatorSubbandProc::processSubband() +//# tBeamFormerSubbandProcProcessSb: test of Beamformer subband processor. +//# //# Copyright (C) 2013 ASTRON (Netherlands Institute for Radio Astronomy) //# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands //# @@ -29,14 +30,21 @@ #include <GPUProc/SubbandProcs/BeamFormerSubbandProc.h> #include <GPUProc/SubbandProcs/BeamFormerFactories.h> +#include "../fpequals.h" + using namespace std; using namespace LOFAR::Cobalt; using namespace LOFAR::TYPES; +const unsigned nrChannel1 = 64; +const unsigned nrChannel2 = 64; + template<typename T> T inputSignal(size_t t) { - double freq = 1.0 / 2.0; // in samples - double amp = 255.0; + size_t nrBits = sizeof(T) / 2 * 8; + double freq = 1.0 / 4.0; // in samples + // double freq = (2 * 64.0 + 17.0) / 4096.0; // in samples + double amp = (1 << (nrBits - 1)) - 1; double angle = (double)t * 2.0 * M_PI * freq; @@ -63,38 +71,93 @@ int main() { Parset ps("tBeamFormerSubbandProcProcessSb.parset"); - // Create very simple kernel programs, with predictable output. Skip as much as possible. - // Nr of channels/sb from the parset is 1, so the PPF will not even run. - // Parset also has turned of delay compensation and bandpass correction - // (but that kernel will run to convert int to float and to transform the data order). + // Input array sizes + const size_t nrBeams = ps.nrBeams(); + const size_t nrStations = ps.nrStations(); + const size_t nrPolarisations = ps.settings.nrPolarisations; + const size_t maxNrTABsPerSAP = ps.settings.beamFormer.maxNrTABsPerSAP(); + const size_t nrSamplesPerSubband = ps.nrSamplesPerSubband(); + const size_t nrBitsPerSample = ps.settings.nrBitsPerSample; + const size_t nrBytesPerComplexSample = ps.nrBytesPerComplexSample(); + + // We only support 8-bit or 16-bit input samples + ASSERT(nrBitsPerSample == 8 || nrBitsPerSample == 16); + + // TODO: Amplitude is now calculated at two places. Dangerous! + const size_t amplitude = (1 << (nrBitsPerSample - 1)) - 1; + const size_t scaleFactor = nrBitsPerSample == 16 ? 1 : 16; + + LOG_INFO_STR( + "Input info:" << + "\n nrBeams = " << nrBeams << + "\n nrStations = " << nrStations << + "\n nrPolarisations = " << nrPolarisations << + "\n maxNrTABsPerSAP = " << maxNrTABsPerSAP << + "\n nrSamplesPerSubband = " << nrSamplesPerSubband << + "\n nrBitsPerSample = " << nrBitsPerSample << + "\n nrBytesPerComplexSample = " << nrBytesPerComplexSample); + + // Output array sizes + const size_t nrStokes = ps.settings.beamFormer.incoherentSettings.nrStokes; + const size_t nrChannels = + ps.settings.beamFormer.incoherentSettings.nrChannels; + const size_t nrSamples = + ps.settings.beamFormer.incoherentSettings.nrSamples( + ps.settings.nrSamplesPerSubband()); + + LOG_INFO_STR( + "Output info:" << + "\n nrStokes = " << nrStokes << + "\n nrChannels = " << nrChannels << + "\n nrSamples = " << nrSamples << + "\n scaleFactor = " << scaleFactor); + + // Create very simple kernel programs, with predictable output. Skip as much + // as possible. Nr of channels/sb from the parset is 1, so the PPF will not + // even run. Parset also has turned of delay compensation and bandpass + // correction (but that kernel will run to convert int to float and to + // transform the data order). BeamFormerFactories factories(ps); BeamFormerSubbandProc bwq(ps, ctx, factories); - SubbandProcInputData in(ps.nrBeams(), ps.nrStations(), ps.settings.nrPolarisations, - ps.settings.beamFormer.maxNrTABsPerSAP(), - ps.nrSamplesPerSubband(), ps.nrBytesPerComplexSample(), ctx); + SubbandProcInputData in( + nrBeams, nrStations, nrPolarisations, maxNrTABsPerSAP, + nrSamplesPerSubband, nrBytesPerComplexSample, ctx); // Initialize synthetic input to input signal - for (size_t st = 0; st < ps.nrStations(); st++) - for (size_t i = 0; i < ps.nrSamplesPerSubband(); i++) - for (size_t pol = 0; pol < NR_POLARIZATIONS; pol++) + for (size_t st = 0; st < nrStations; st++) + for (size_t i = 0; i < nrSamplesPerSubband; i++) + for (size_t pol = 0; pol < nrPolarisations; pol++) { - if (ps.nrBytesPerComplexSample() == 4) { // 16 bit mode - *(i16complex*)&in.inputSamples[st][i][pol][0] = inputSignal<i16complex>(i); - } else if (ps.nrBytesPerComplexSample() == 2) { // 8 bit mode - *(i8complex*)&in.inputSamples[st][i][pol][0] = inputSignal<i8complex>(i); - } else { - cerr << "Error: number of bits per sample must be 8, or 16" << endl; - exit(1); + switch(nrBitsPerSample) { + case 8: + reinterpret_cast<i8complex&>(in.inputSamples[st][i][pol][0]) = + inputSignal<i8complex>(i); + break; + case 16: + reinterpret_cast<i16complex&>(in.inputSamples[st][i][pol][0]) = + inputSignal<i16complex>(i); + break; + default: + break; } } - // Initialize subbands partitioning administration (struct BlockID). We only do the 1st block of whatever. - in.blockID.block = 0; // Block number: 0 .. inf - in.blockID.globalSubbandIdx = 0; // Subband index in the observation: [0, ps.nrSubbands()) - in.blockID.localSubbandIdx = 0; // Subband index for this pipeline/workqueue: [0, subbandIndices.size()) - in.blockID.subbandProcSubbandIdx = 0; // Subband index for this SubbandProc + // Initialize subbands partitioning administration (struct BlockID). We only + // do the 1st block of whatever. + + // Block number: 0 .. inf + in.blockID.block = 0; + + // Subband index in the observation: [0, ps.nrSubbands()) + in.blockID.globalSubbandIdx = 0; + + // Subband index for this pipeline/workqueue: [0, subbandIndices.size()) + in.blockID.localSubbandIdx = 0; + + // Subband index for this SubbandProc + in.blockID.subbandProcSubbandIdx = 0; // Initialize delays. We skip delay compensation, but init anyway, // so we won't copy uninitialized data to the device. @@ -107,10 +170,7 @@ int main() { for (size_t i = 0; i < in.tabDelays.num_elements(); i++) in.tabDelays.get<float>()[i] = 0.0f; - BeamFormedData out(ps.settings.beamFormer.maxNrTABsPerSAP() * ps.settings.beamFormer.coherentSettings.nrStokes, - ps.settings.beamFormer.coherentSettings.nrChannels, - ps.settings.beamFormer.coherentSettings.nrSamples(ps.settings.nrSamplesPerSubband()), - ctx); + BeamFormedData out(maxNrTABsPerSAP * nrStokes, nrChannels, nrSamples, ctx); for (size_t i = 0; i < out.num_elements(); i++) out.get<float>()[i] = 42.0f; @@ -124,39 +184,42 @@ int main() { cout << "Output: " << endl; // Output verification - // The int2float conversion scales its output to the same amplitude as in 16 bit mode. - // For 8 bit mode, that is a factor 256. - // Since we inserted all (1, 1) vals, for 8 bit mode this means that the correlator - // outputs 256*256. It then sums over nrSamplesPerSb values. -#if 0 - unsigned scale = 1*1; - if (ps.nrBitsPerSample() == 8) - scale = 256*256; -#endif - bool unexpValueFound = false; - for (size_t b = 0; b < ps.settings.beamFormer.maxNrTABsPerSAP() * ps.settings.beamFormer.coherentSettings.nrStokes; b++) - for (size_t t = 0; t < ps.settings.beamFormer.coherentSettings.nrSamples(ps.settings.nrSamplesPerSubband()); t++) - for (size_t c = 0; c < ps.settings.beamFormer.coherentSettings.nrChannels; c++) - { - float v = out[b][t][c]; -// disable output validation until we've verified the beamformer pipeline! -#if 0 - if (v != 4.0f) - { - unexpValueFound = true; - cout << '*'; // indicate error in output - } -#endif - cout << v << " "; - } - cout << endl; - if (unexpValueFound) - { - cerr << "Error: Found unexpected output value(s)" << endl; - return 1; + // We can calculate the expected output values, since we're supplying a + // complex sine/cosine input signal. We only have Stokes-I, so the output + // should be: (nrStation * amp * scaleFactor * nrChannel1 * nrChannel2)^2 + // - amp is set to the maximum possible value for the bit-mode: + // i.e. 127 for 8-bit and 32767 for 16-bit mode + // - scaleFactor is the scaleFactor applied by the IntToFloat kernel. + // It is 16 for 8-bit mode and 1 for 16-bit mode. + // Hence, each output sample should be: + // - for 16-bit input: (2 * 32767 * 1 * 64 * 64)^2 = 72053196058525696 + // - for 8-bit input: (2 * 127 * 16 * 64 * 64)^2 = 1082398867456 + + float outVal; + switch(nrBitsPerSample) { + case 8: + outVal = + nrStations * amplitude * scaleFactor * nrChannel1 * nrChannel2 * + nrStations * amplitude * scaleFactor * nrChannel1 * nrChannel2; + break; + case 16: + outVal = + nrStations * amplitude * scaleFactor * nrChannel1 * nrChannel2 * + nrStations * amplitude * scaleFactor * nrChannel1 * nrChannel2; + break; + default: + break; } - + cout << "outVal = " << outVal << endl; + + for (size_t s = 0; s < nrStokes; s++) + for (size_t t = 0; t < nrSamples; t++) + for (size_t c = 0; c < nrChannels; c++) + ASSERTSTR(fpEquals(out[s][t][c], outVal), + "out[" << s << "][" << t << "][" << c << "] = " << + out[s][t][c] << "; outVal = " << outVal); + return 0; } diff --git a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.parset b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.parset index 36d890a142dd345a2a8f7df6f45179ee1374174d..e64611e615b91e354ae007be39d4d1fd389886ba 100644 --- a/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.parset +++ b/RTCP/Cobalt/GPUProc/test/SubbandProcs/tBeamFormerSubbandProcProcessSb.parset @@ -8,21 +8,30 @@ Observation.VirtualInstrument.stationList = [CS002] Observation.antennaSet = HBA_DUAL Observation.nrBeams = 1 Observation.Beam[0].subbandList = [300] -Observation.Beam[0].nrTiedArrayBeams = 2 +Observation.Beam[0].nrTiedArrayBeams = 1 Observation.Beam[0].TiedArrayBeam[0].angle1 = 0 Observation.Beam[0].TiedArrayBeam[0].angle2 = 0 -Observation.Beam[0].TiedArrayBeam[0].coherent = T -Observation.Beam[0].TiedArrayBeam[1].angle1 = 0 -Observation.Beam[0].TiedArrayBeam[1].angle2 = 0 -Observation.Beam[0].TiedArrayBeam[1].coherent = T +Observation.Beam[0].TiedArrayBeam[0].coherent = F OLAP.CNProc_CoherentStokes.which = I # IQUV OLAP.CNProc_CoherentStokes.timeIntegrationFactor = 1 OLAP.CNProc_CoherentStokes.channelsPerSubband = 1 +OLAP.CNProc_IncoherentStokes.which = I # IQUV +OLAP.CNProc_IncoherentStokes.timeIntegrationFactor = 1 +OLAP.CNProc_IncoherentStokes.channelsPerSubband = 1 Observation.rspBoardList = [0] Observation.rspSlotList = [0] Observation.DataProducts.Output_Beamformed.enabled=true -Observation.DataProducts.Output_Beamformed.filenames=[tab0.raw,tab1.raw] -Observation.DataProducts.Output_Beamformed.locations=[2*:.] +Observation.DataProducts.Output_Beamformed.filenames=[tab0.raw] +Observation.DataProducts.Output_Beamformed.locations=[1*:.] +Cobalt.Kernels.BeamFormerKernel.dumpOutput = true +Cobalt.Kernels.BeamFormerTransposeKernel.dumpOutput = true +Cobalt.Kernels.CoherentStokesKernel.dumpOutput = true +Cobalt.Kernels.DelayAndBandPassKernel.dumpOutput = true +Cobalt.Kernels.FFT_Kernel.dumpOutput = true +Cobalt.Kernels.FIR_FilterKernel.dumpOutput = true +Cobalt.Kernels.IncoherentStokesKernel.dumpOutput = true +Cobalt.Kernels.IncoherentStokesTransposeKernel.dumpOutput = true +Cobalt.Kernels.IntToFloatKernel.dumpOutput = true diff --git a/RTCP/Cobalt/GPUProc/test/cuda/tBandPassCorrection.cc b/RTCP/Cobalt/GPUProc/test/cuda/tBandPassCorrection.cc index 5625c13993b976ba2e6e3b885dbc504151da32d2..126caca50e33f56b5f6d7bd4160c440dcb49b8ad 100644 --- a/RTCP/Cobalt/GPUProc/test/cuda/tBandPassCorrection.cc +++ b/RTCP/Cobalt/GPUProc/test/cuda/tBandPassCorrection.cc @@ -127,6 +127,7 @@ CompileDefinitions getDefaultCompileDefinitions() boost::lexical_cast<string>(NR_POLARIZATIONS); defs["NR_BITS_PER_SAMPLE"] = boost::lexical_cast<string>(NR_BITS_PER_SAMPLE); + defs["DO_BANDPASS_CORRECTION"] = "1"; return defs; }