From cc8174b5f955a0411a9e14b4f1a9e220703db184 Mon Sep 17 00:00:00 2001 From: Jan David Mol <mol@astron.nl> Date: Mon, 9 Sep 2013 16:17:42 +0000 Subject: [PATCH] Task #2669: Use 2x2 correlator kernel for better performance, and cleaned up global_defines a bit --- RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc | 9 ++++++--- RTCP/Cobalt/GPUProc/src/global_defines.cc | 3 --- RTCP/Cobalt/GPUProc/src/global_defines.h | 4 ---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc index b4bb8ccf782..0679e7adc4e 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc @@ -32,6 +32,10 @@ #include <GPUProc/global_defines.h> +// For Cobalt (= up to 80 antenna fields), the 2x2 kernel gives the best +// performance. +#define USE_2X2 + namespace LOFAR { namespace Cobalt @@ -56,8 +60,7 @@ namespace LOFAR setArg(0, buffers.output); setArg(1, buffers.input); - size_t maxNrThreads, preferredMultiple; - maxNrThreads = getAttribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK); + size_t preferredMultiple; gpu::Platform pf; if (pf.getName() == "AMD Accelerated Parallel Processing") { @@ -80,7 +83,7 @@ namespace LOFAR # else unsigned nrBlocks = nrBaselines; # endif - unsigned nrPasses = (nrBlocks + maxNrThreads - 1) / maxNrThreads; + unsigned nrPasses = (nrBlocks + maxThreadsPerBlock - 1) / maxThreadsPerBlock; unsigned nrThreads = (nrBlocks + nrPasses - 1) / nrPasses; nrThreads = (nrThreads + preferredMultiple - 1) / preferredMultiple * preferredMultiple; diff --git a/RTCP/Cobalt/GPUProc/src/global_defines.cc b/RTCP/Cobalt/GPUProc/src/global_defines.cc index c26aab7e495..922ddf5348d 100644 --- a/RTCP/Cobalt/GPUProc/src/global_defines.cc +++ b/RTCP/Cobalt/GPUProc/src/global_defines.cc @@ -34,9 +34,6 @@ namespace LOFAR { bool profiling = false; bool gpuProfiling = true; - const char *str = getenv("NR_GPUS"); - unsigned nrGPUs = str ? atoi(str) : 1; - inline void set_affinity(unsigned device) { diff --git a/RTCP/Cobalt/GPUProc/src/global_defines.h b/RTCP/Cobalt/GPUProc/src/global_defines.h index 94f61113bae..ff2d2f87e79 100644 --- a/RTCP/Cobalt/GPUProc/src/global_defines.h +++ b/RTCP/Cobalt/GPUProc/src/global_defines.h @@ -22,11 +22,8 @@ #define LOFAR_GPUPROC_GLOBAL_DEFINES_H #define NR_STATION_FILTER_TAPS 16 -#undef USE_NEW_CORRELATOR #define NR_POLARIZATIONS 2 // TODO: get the nr of pol symbol from an LCS/Common header and/or from CoInterface/Config.h (if that isn't a dup too) #define NR_TAPS 16 -#undef USE_2X2 -#undef USE_TEST_DATA #undef USE_B7015 namespace LOFAR @@ -35,7 +32,6 @@ namespace LOFAR { extern bool profiling; extern bool gpuProfiling; - extern unsigned nrGPUs; void set_affinity(unsigned device); } -- GitLab