diff --git a/libtcc/TCCorrelator.cu b/libtcc/TCCorrelator.cu
index 0e65d2cf5e47f6871facff0346d2c61568658b55..f046c9094d14b8d7761b2795d9a08a621c6ee9cc 100644
--- a/libtcc/TCCorrelator.cu
+++ b/libtcc/TCCorrelator.cu
@@ -8,14 +8,15 @@
 
 #include <mma.h>
 
-#define NR_BASELINES		(NR_RECEIVERS * (NR_RECEIVERS + 1) / 2)
-#define ALIGN(A,N)		(((A)+(N)-1)/(N)*(N))
+#define NR_BASELINES		 (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2)
+#define ALIGN(A,N)		 (((A)+(N)-1)/(N)*(N))
 
-#define NR_TIMES_PER_BLOCK	(128 / (NR_BITS))
-#define NR_RECEIVERS_PER_TCM_X	((NR_BITS) == 4 ? 2 : 4)
-#define NR_RECEIVERS_PER_TCM_Y	8
+#define NR_TIMES_PER_BLOCK	 (128 / (NR_BITS))
+#define NR_RECEIVERS_PER_TCM_X	 ((NR_BITS) == 4 ? 2 : 4)
+#define NR_RECEIVERS_PER_TCM_Y	 8
+#define NR_RECEIVERS_PER_BLOCK_X (NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK)
 
-#define COMPLEX			2
+#define COMPLEX			 2
 
 #if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 700 : 0)
 #error this architecture has no suitable tensor cores
@@ -267,12 +268,12 @@ template <typename T> __device__ inline void storeVisibility(Visibilities visibi
 
 template <typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI)
 {
-  if ((skipCheckX || recvX + tcX <= recvY + tcY) && (skipCheckY || recvY + tcY < NR_RECEIVERS))
-    storeVisibility(visibilities, channel, baseline + tcY * recvY + tcY * (tcY + 1) / 2 + tcX, polY, polX, make_complex(sumR, sumI));
+  if ((skipCheckY || recvY + tcY <= recvX + tcX) && (skipCheckX || recvX + tcX < NR_RECEIVERS))
+    storeVisibility(visibilities, channel, baseline + tcX * recvX + tcX * (tcX + 1) / 2 + tcY, polY, polX, make_complex(sumR, sumI));
 }
 
 
-__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned recvYoffset, unsigned recvXoffset, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp)
+__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp)
 {
 #if defined PORTABLE
  store_matrix_sync(&scratchSpace[warp][0][0][0][0].x, sum, NR_RECEIVERS_PER_TCM_X * NR_POLARIZATIONS * COMPLEX, mem_row_major);
@@ -297,11 +298,11 @@ __device__ inline void storeVisibilities(Visibilities visibilities, unsigned cha
   unsigned _x       = threadIdx.x & 3;
 #endif
 
-  unsigned recvY    = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + _y;
-  unsigned recvX    = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + _x;
-  unsigned baseline = (recvY * (recvY + 1) / 2) + recvX;
+  unsigned recvY    = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + _y;
+  unsigned recvX    = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + _x;
+  unsigned baseline = (recvX * (recvX + 1) / 2) + recvY;
 
-  if ((skipCheckX || recvX <= recvY) && (skipCheckY || recvY < NR_RECEIVERS))
+  if ((skipCheckY || recvY <= recvX) && (skipCheckX || recvX < NR_RECEIVERS))
 #if NR_BITS == 4
     for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++)
       visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX];
@@ -312,18 +313,18 @@ __device__ inline void storeVisibilities(Visibilities visibilities, unsigned cha
 #endif
 #else
 #if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16)
-  unsigned recvY    = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4);
-  unsigned recvX    = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2);
+  unsigned recvY    = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4);
+  unsigned recvX    = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2);
   unsigned polY     = threadIdx.x & 1;
   unsigned polX     = (threadIdx.x >> 1) & 1;
 #elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860
-  unsigned recvY    = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3);
-  unsigned recvX    = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1);
+  unsigned recvY    = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3);
+  unsigned recvX    = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1);
   unsigned polY     = (threadIdx.x >> 2) & 1;
   unsigned polX     = threadIdx.x & 1;
 #endif
 
-  unsigned baseline = (recvY * (recvY + 1) / 2) + recvX;
+  unsigned baseline = (recvX * (recvX + 1) / 2) + recvY;
 
 #if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16)
   storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]);
@@ -361,9 +362,9 @@ template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities vi
 
   const uchar2 offsets[] = {
     make_uchar2( 0,  0),
-    make_uchar2( 0, 16),
-    make_uchar2( 0, 40),
-    make_uchar2(24, 40),
+    make_uchar2(16,  0),
+    make_uchar2(40,  0),
+    make_uchar2(40, 24),
   };
 
   unsigned recvXoffset = offsets[warp].x;
@@ -444,7 +445,7 @@ template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities vi
 	  for (unsigned y = 0; y < 2; y ++) {
 	    load_matrix_sync(aFrag, &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS);
 
-	    for (unsigned x = 0; x < 8 * (y + 1) / NR_RECEIVERS_PER_TCM_X; x ++, i ++)
+	    for (unsigned x = 8 * y / NR_RECEIVERS_PER_TCM_X; x < 16 / NR_RECEIVERS_PER_TCM_X; x ++, i ++)
 	      mma_sync(sum[i], aFrag, bFrag[x], sum[i]);
 	  }
 	}
@@ -459,21 +460,19 @@ template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities vi
   if (warp != 0)
     for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++)
       for (unsigned x = 0; x < nrFragmentsX; x ++, i ++)
-	storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, recvYoffset, recvXoffset, y, x, fullTriangle, y > 0 || x < (NR_BITS == 4 ? 8 : 4), sum[i], scratchSpace, warp);
+	storeVisibilities(visibilities, channel, firstReceiver + recvYoffset, firstReceiver + recvXoffset, y, x, y < 2 || x > (NR_BITS == 4 ? 4 : 2), fullTriangle, sum[i], scratchSpace, warp);
   else
     for (unsigned z = 0, i = 0; z < 3; z ++)
       for (unsigned y = 0; y < 2; y ++)
-	for (unsigned x = 0; x < (NR_BITS == 4 ? 4 : 2) * (y + 1); x ++, i ++)
-	  storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, 24 * z, 24 * z, y, x, fullTriangle, x < (NR_BITS == 4 ? 4 : 2) * y, sum[i], scratchSpace, warp);
+	for (unsigned x = 8 * y / NR_RECEIVERS_PER_TCM_X; x < 16 / NR_RECEIVERS_PER_TCM_X; x ++, i ++)
+	  storeVisibilities(visibilities, channel, firstReceiver + 24 * z, firstReceiver + 24 * z, y, x, (y + 1) * NR_RECEIVERS_PER_TCM_Y <= x * NR_RECEIVERS_PER_TCM_X, fullTriangle, sum[i], scratchSpace, warp);
 }
 
 #endif
 
 
-template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS])
+template <unsigned nrFragmentsY, unsigned nrFragmentsX, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK_X>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS])
 {
-  const unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_X / 2 / (NR_RECEIVERS_PER_BLOCK == 64 ? 2 : 1);
-
   Sum sum[nrFragmentsY][nrFragmentsX];
 
   for (unsigned y = 0; y < nrFragmentsY; y ++)
@@ -611,7 +610,7 @@ template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool
 
   for (unsigned y = 0; y < nrFragmentsY; y ++)
     for (unsigned x = 0; x < nrFragmentsX; x ++)
-      storeVisibilities(visibilities, channel, firstReceiverY, firstReceiverX, recvYoffset, recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize);
+      storeVisibilities(visibilities, channel, firstReceiverY + recvYoffset, firstReceiverX + recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize);
 }
 
 
@@ -619,25 +618,31 @@ extern "C" __global__
 __launch_bounds__(NR_WARPS * 32, NR_RECEIVERS_PER_BLOCK == 32 ? 4 : 2)
 void correlate(Visibilities visibilities, const Samples samples)
 {
-  const unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2;
+  constexpr unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK_X / NR_RECEIVERS_PER_TCM_X / 2;
+  constexpr unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK   / NR_RECEIVERS_PER_TCM_Y / 2;
 
   unsigned block = blockIdx.x;
 
 #if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48
-  unsigned blockY = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2;
-  unsigned blockX = block - blockY * (blockY + 1) / 2;
+  unsigned blockX = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2;
+  unsigned blockY = block - blockX * (blockX + 1) / 2;
+  unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK;
   unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK;
 #elif NR_RECEIVERS_PER_BLOCK == 64
-  unsigned blockY = (unsigned) sqrtf(block);
-  unsigned blockX = block - blockY * blockY;
-  unsigned firstReceiverX = blockX * (NR_RECEIVERS_PER_BLOCK / 2);
+  unsigned blockX = (unsigned) sqrtf(block);
+  unsigned blockY = block - blockX * blockX;
+  unsigned firstReceiverY = blockY / 2 * NR_RECEIVERS_PER_BLOCK;
+  //unsigned firstReceiverX = (2 * blockX + blockY % 2) * (NR_RECEIVERS_PER_BLOCK / 2);
+  unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK + (blockY % 2) * NR_RECEIVERS_PER_BLOCK / 2;
+
+  if (firstReceiverX >= NR_RECEIVERS)
+    return;
 #endif
-  unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK;
 
   union shared {
     struct {
       SharedData<>::Asamples aSamples;
-      SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples bSamples;
+      SharedData<NR_RECEIVERS_PER_BLOCK_X>::Bsamples bSamples;
     } rectangle;
     struct {
       SharedData<>::Bsamples samples;
@@ -653,19 +658,19 @@ void correlate(Visibilities visibilities, const Samples samples)
 
   if (firstReceiverX == firstReceiverY)
 #if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48
-    doCorrelateRectangle<nrFragmentsY, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace);
+    doCorrelateRectangle<nrFragmentsY, nrFragmentsX, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); // TODO: smaller nrFragments[XY]
 #elif NR_RECEIVERS_PER_BLOCK == 64
     if (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 && (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK))
       doCorrelateTriangle<false>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace);
     else
       doCorrelateTriangle<true>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace);
 #endif
-#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0
-  else if (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverY >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK)
-    doCorrelateRectangle<(NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK + 2 * NR_RECEIVERS_PER_TCM_Y - 1) / NR_RECEIVERS_PER_TCM_Y / 2, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_Y) == 0, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace);
+#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK_X != 0
+  else if (firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK_X * NR_RECEIVERS_PER_BLOCK_X)
+    doCorrelateRectangle<nrFragmentsY, (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK_X + 2 * NR_RECEIVERS_PER_TCM_X - 1) / (2 * NR_RECEIVERS_PER_TCM_X), true, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_X) == 0>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace);
 #endif
   else
-    doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace);
+    doCorrelateRectangle<nrFragmentsY, nrFragmentsX, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace);
 
 #if 0
   if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
diff --git a/test/CorrelatorTest/CorrelatorTest.cc b/test/CorrelatorTest/CorrelatorTest.cc
index 33b33d01381d53f22fbbaee48bce2f47b73d6211..728d049c638e4bf86e49a20aec8fdb247777c977 100644
--- a/test/CorrelatorTest/CorrelatorTest.cc
+++ b/test/CorrelatorTest/CorrelatorTest.cc
@@ -186,25 +186,25 @@ template<typename SampleType, typename VisibilityType> void CorrelatorTest::veri
       for (unsigned major_time = 0; major_time < options.nrSamplesPerChannel / options.nrTimesPerBlock; major_time ++) {
 	multi_array::array_ref<SampleType, 3> ref = samples[channel][major_time];
 
-	for (unsigned recv1 = 0, baseline = 0; recv1 < options.nrReceivers; recv1 ++)
-	  for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++)
+	for (unsigned recvX = 0, baseline = 0; recvX < options.nrReceivers; recvX ++)
+	  for (unsigned recvY = 0; recvY <= recvX; recvY ++, baseline ++)
 	    for (unsigned minor_time = 0; minor_time < options.nrTimesPerBlock; minor_time ++)
-	      for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++)
-		for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) {
-		  SampleType sample0 = ref[recv0][pol0][minor_time];
-		  SampleType sample1 = ref[recv1][pol1][minor_time];
+	      for (unsigned polY = 0; polY < options.nrPolarizations; polY ++)
+		for (unsigned polX = 0; polX < options.nrPolarizations; polX ++) {
+		  SampleType sampleY = ref[recvY][polY][minor_time];
+		  SampleType sampleX = ref[recvX][polX][minor_time];
 
-		  sum[baseline][pol1][pol0] += VisibilityType(sample1.real(), sample1.imag()) * conj(VisibilityType(sample0.real(), sample0.imag()));
+		  sum[baseline][polY][polX] += VisibilityType(sampleY.real(), sampleY.imag()) * conj(VisibilityType(sampleX.real(), sampleX.imag()));
 		}
       }
 
       for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++)
-	for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++)
-	  for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++)
-	    if (!approximates(visibilities[channel][baseline][pol1][pol0], sum[baseline][pol1][pol0]) && ++ count < 100)
+	for (unsigned polY = 0; polY < options.nrPolarizations; polY ++)
+	  for (unsigned polX = 0; polX < options.nrPolarizations; polX ++)
+	    if (!approximates(visibilities[channel][baseline][polY][polX], sum[baseline][polY][polX]) && ++ count < 100)
 #pragma omp critical (cout)
 	      ep([&] () {
-		std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl;
+		std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "], expected " << sum[baseline][polY][polX] << ", got " << visibilities[channel][baseline][polY][polX] << std::endl;
 	      });
     });
 
@@ -212,11 +212,11 @@ template<typename SampleType, typename VisibilityType> void CorrelatorTest::veri
 #else
   for (unsigned channel = 0; channel < options.nrChannels; channel ++)
     for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++)
-      for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++)
-	for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++)
-	    if (visibilities[channel][baseline][pol0][pol1] != (VisibilityType)(0, 0))
+      for (unsigned polY = 0; polY < options.nrPolarizations; polY ++)
+	for (unsigned polX = 0; polX < options.nrPolarizations; polX ++)
+	    if (visibilities[channel][baseline][polY][polX] != (VisibilityType)(0, 0))
 	      if (count ++ < 10)
-		std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl;
+		std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "] = " << visibilities[channel][baseline][polY][polX] << std::endl;
 #endif
 }
 
diff --git a/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc
index 9099af31f97d882e73947e7f8528feefa4fdeb78..c427c0e3a7d776465d9d0451c5e9b2c832d3a87e 100644
--- a/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc
+++ b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc
@@ -22,6 +22,10 @@
 #include <omp.h>
 #include <sys/types.h>
 
+#if NR_BITS == 16
+#include <cuda_fp16.h> // we do not use code, only the __half type
+#endif
+
 #define __CL_ENABLE_EXCEPTIONS
 #include <CL/cl.hpp>
 #include <CL/cl_ext.h>
@@ -348,32 +352,40 @@ void checkTestPattern(cl::CommandQueue &queue, cl::Buffer &visibilitiesBuffer, c
     memset(sum, 0, sizeof sum);
 
     for (unsigned major_time = 0; major_time < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; major_time ++)
-      for (unsigned recv1 = 0, baseline = 0; recv1 < NR_RECEIVERS; recv1 ++)
-        for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++)
+      for (unsigned recvX = 0, baseline = 0; recvX < NR_RECEIVERS; recvX ++)
+        for (unsigned recvY = 0; recvY <= recvX; recvY ++, baseline ++)
           for (unsigned minor_time = 0; minor_time < NR_TIMES_PER_BLOCK; minor_time ++)
-            for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++)
-              for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++)
+            for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++)
+              for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) {
+		Sample sampleY = samples[channel][major_time][recvY][polY][minor_time];
+		Sample sampleX = samples[channel][major_time][recvX][polX][minor_time];
+
+		sum[baseline][polY][polX] += Visibility(sampleY.real(), sampleY.imag()) * conj(Visibility(sampleX.real(), sampleX.imag()));
+#if 0
 #if NR_BITS == 4 || NR_BITS == 8
-                sum[baseline][pol1][pol0] += (std::complex<int32_t>) samples[channel][major_time][recv1][pol1][minor_time] * conj((std::complex<int32_t>) samples[channel][major_time][recv0][pol0][minor_time]);
+                sum[baseline][polY][polX] += (std::complex<int32_t>) samples[channel][major_time][recvY][polY][minor_time] * conj((std::complex<int32_t>) samples[channel][major_time][recvX][polX][minor_time]);
 #elif NR_BITS == 16
-                sum[baseline][pol1][pol0] += samples[channel][major_time][recv1][pol1][minor_time] * conj(samples[channel][major_time][recv0][pol0][minor_time]);
+                //sum[baseline][polY][polX] += (std::complex<float>) samples[channel][major_time][recvY][polY][minor_time] * conj((std::complex<float>) samples[channel][major_time][recvX][polX][minor_time]);
+                sum[baseline][polY][polX] += (std::complex<float>) samples[channel][major_time][recvY][polY][minor_time] * conj((std::complex<float>) samples[channel][major_time][recvX][polX][minor_time]);
+#endif
 #endif
+	      }
 
     for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++)
-      for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++)
-        for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++)
+      for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++)
+        for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++)
 #if NR_BITS == 4 || NR_BITS == 8
-          if (visibilities[channel][baseline][pol1][pol0] != sum[baseline][pol1][pol0] && ++ count < 10)
+          if (visibilities[channel][baseline][polY][polX] != sum[baseline][polY][polX] && ++ count < 10)
 #pragma omp critical (cout)
-            std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl;
+            std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "], expected " << sum[baseline][polY][polX] << ", got " << visibilities[channel][baseline][polY][polX] << std::endl;
 #elif NR_BITS == 16
-          float absolute = abs(visibilities[channel][baseline][pol1][pol0] - sum[baseline][pol1][pol0]);
-          float relative = abs(visibilities[channel][baseline][pol1][pol0] / sum[baseline][pol1][pol0]);
-                                                                        195,1         83%
-
+	  {
+	    float absolute = abs(visibilities[channel][baseline][polY][polX] - sum[baseline][polY][polX]);
+	    float relative = abs(visibilities[channel][baseline][polY][polX] / sum[baseline][polY][polX]);
 
-          if ((relative < .999 || relative > 1.001) && absolute > .0001 * NR_SAMPLES_PER_CHANNEL && ++ count < 10)
-              std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << ", relative = " << relative << ", absolute = " << absolute << std::endl;
+	    if ((relative < .999 || relative > 1.001) && absolute > .0001 * NR_SAMPLES_PER_CHANNEL && ++ count < 10)
+		std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "], expected " << sum[baseline][polY][polX] << ", got " << visibilities[channel][baseline][polY][polX] << ", relative = " << relative << ", absolute = " << absolute << std::endl;
+	  }
 #endif
     delete &sum;
   }
@@ -384,16 +396,16 @@ void checkTestPattern(cl::CommandQueue &queue, cl::Buffer &visibilitiesBuffer, c
 #else
   for (unsigned channel = 0; channel < NR_CHANNELS; channel ++)
     for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++)
-      for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++)
-        for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++)
+      for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++)
+        for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++)
 #if NR_BITS == 4 || NR_BITS == 8
-            if (visibilities[channel][baseline][pol0][pol1] != std::complex<int32_t>(0, 0))
+            if (visibilities[channel][baseline][polY][polX] != std::complex<int32_t>(0, 0))
 #elif NR_BITS == 16
-            if (visibilities[channel][baseline][pol0][pol1] != std::complex<float>(0.0f, 0.0f))
+            if (visibilities[channel][baseline][polY][polX] != std::complex<float>(0.0f, 0.0f))
 #endif
               if (count ++ < 10)
 #pragma omp critical (cout)
-                std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl;
+                std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "] = " << visibilities[channel][baseline][polY][polX] << std::endl;
 #endif
 
   queue.enqueueUnmapMemObject(visibilitiesBuffer, visibilities);
diff --git a/test/SimpleExample/SimpleExample.cu b/test/SimpleExample/SimpleExample.cu
index 4fa52ddd281d7483c89031dac05dbe1b7a669783..ba862d35587c462431b81a10d61f097cd49c826b 100644
--- a/test/SimpleExample/SimpleExample.cu
+++ b/test/SimpleExample/SimpleExample.cu
@@ -66,7 +66,7 @@ int main()
     correlator.launchAsync((CUstream) stream, (CUdeviceptr) visibilities, (CUdeviceptr) samples);
     checkCudaCall(cudaDeviceSynchronize());
 
-    std::cout << ((*visibilities)[160][87745][0][0] == Visibility(23, -2) ? "success" : "failed") << std::endl;
+    std::cout << ((*visibilities)[160][87745][0][0] == Visibility(23, 2) ? "success" : "failed") << std::endl;
 
     checkCudaCall(cudaFree(visibilities));
     checkCudaCall(cudaFree(samples));