From cc8174b5f955a0411a9e14b4f1a9e220703db184 Mon Sep 17 00:00:00 2001
From: Jan David Mol <mol@astron.nl>
Date: Mon, 9 Sep 2013 16:17:42 +0000
Subject: [PATCH] Task #2669: Use 2x2 correlator kernel for better performance,
 and clean up global_defines a bit

---
 RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc | 9 ++++++---
 RTCP/Cobalt/GPUProc/src/global_defines.cc                | 3 ---
 RTCP/Cobalt/GPUProc/src/global_defines.h                 | 4 ----
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc
index b4bb8ccf782..0679e7adc4e 100644
--- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc
+++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/CorrelatorKernel.cc
@@ -32,6 +32,10 @@
 
 #include <GPUProc/global_defines.h>
 
+// For Cobalt (= up to 80 antenna fields), the 2x2 kernel gives the best
+// performance.
+#define USE_2X2
+
 namespace LOFAR
 {
   namespace Cobalt
@@ -56,8 +60,7 @@ namespace LOFAR
       setArg(0, buffers.output);
       setArg(1, buffers.input);
 
-      size_t maxNrThreads, preferredMultiple;
-      maxNrThreads = getAttribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
+      size_t preferredMultiple;
 
       gpu::Platform pf;
       if (pf.getName() == "AMD Accelerated Parallel Processing") {
@@ -80,7 +83,7 @@ namespace LOFAR
 # else
       unsigned nrBlocks = nrBaselines;
 # endif
-      unsigned nrPasses = (nrBlocks + maxNrThreads - 1) / maxNrThreads;
+      unsigned nrPasses = (nrBlocks + maxThreadsPerBlock - 1) / maxThreadsPerBlock;
       unsigned nrThreads = (nrBlocks + nrPasses - 1) / nrPasses;
       nrThreads = (nrThreads + preferredMultiple - 1) / preferredMultiple * preferredMultiple;
 
diff --git a/RTCP/Cobalt/GPUProc/src/global_defines.cc b/RTCP/Cobalt/GPUProc/src/global_defines.cc
index c26aab7e495..922ddf5348d 100644
--- a/RTCP/Cobalt/GPUProc/src/global_defines.cc
+++ b/RTCP/Cobalt/GPUProc/src/global_defines.cc
@@ -34,9 +34,6 @@ namespace LOFAR
   {
     bool profiling = false;
     bool gpuProfiling = true;
-    const char *str = getenv("NR_GPUS");
-    unsigned nrGPUs = str ? atoi(str) : 1;
-
 
     inline void set_affinity(unsigned device)
     {
diff --git a/RTCP/Cobalt/GPUProc/src/global_defines.h b/RTCP/Cobalt/GPUProc/src/global_defines.h
index 94f61113bae..ff2d2f87e79 100644
--- a/RTCP/Cobalt/GPUProc/src/global_defines.h
+++ b/RTCP/Cobalt/GPUProc/src/global_defines.h
@@ -22,11 +22,8 @@
 #define LOFAR_GPUPROC_GLOBAL_DEFINES_H
 
 #define NR_STATION_FILTER_TAPS  16
-#undef USE_NEW_CORRELATOR
 #define NR_POLARIZATIONS         2 // TODO: get the nr of pol symbol from an LCS/Common header and/or from CoInterface/Config.h (if that isn't a dup too)
 #define NR_TAPS                 16
-#undef USE_2X2
-#undef USE_TEST_DATA
 #undef USE_B7015
 
 namespace LOFAR
@@ -35,7 +32,6 @@ namespace LOFAR
   {
     extern bool profiling;
     extern bool gpuProfiling;
-    extern unsigned nrGPUs;
 
     void set_affinity(unsigned device);
   }
-- 
GitLab