diff --git a/libtcc/TCCorrelator.cu b/libtcc/TCCorrelator.cu
index 630907969ff854b21117eaa8da6d3db478409d6d..0e65d2cf5e47f6871facff0346d2c61568658b55 100644
--- a/libtcc/TCCorrelator.cu
+++ b/libtcc/TCCorrelator.cu
@@ -194,23 +194,23 @@ template <typename T> struct FetchData
   __device__ void load(const Samples samples, unsigned channel, unsigned time, unsigned firstReceiver, bool skipLoadCheck = NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0)
   {
     if (skipLoadCheck || firstReceiver + loadRecv < NR_RECEIVERS)
-      //data = * (T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime];
-      memcpy(&data, &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], sizeof(T));
+      data = * (T *) &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime]; // guarded load: reads one T through a pointer cast; skipped (data left untouched) when the receiver index is >= NR_RECEIVERS and skipLoadCheck is false. NOTE(review): the element address must be suitably aligned for T -- confirm
+      //memcpy(&data, &samples[channel][time][firstReceiver + loadRecv][loadPol][loadTime], sizeof(T)); // byte-copy alternative, kept for reference
   }
 
   template <typename SharedData> __device__ void storeA(SharedData samples) const
   {
-    //* ((T *) &samples[loadRecv][loadPol][loadTime][0]) = data;
-    memcpy(&samples[loadRecv][loadPol][loadTime][0], &data, sizeof(T));
+    * ((T *) &samples[loadRecv][loadPol][loadTime][0]) = data; // writes the held value as one T starting at samples[loadRecv][loadPol][loadTime][0]. NOTE(review): destination must be suitably aligned for T -- confirm
+    //memcpy(&samples[loadRecv][loadPol][loadTime][0], &data, sizeof(T)); // byte-copy alternative, kept for reference
   }
 
   template <typename SharedData> __device__ void storeB(SharedData samples) const
   {
-    //* ((T *) &samples[loadRecv][loadPol][0][loadTime][0]) = data;
-    //* ((T *) &samples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(data);
-    T tmp = conj_perm(data);
-    memcpy(&samples[loadRecv][loadPol][0][loadTime][0], &data, sizeof(T));
-    memcpy(&samples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T));
+    * ((T *) &samples[loadRecv][loadPol][0][loadTime][0]) = data; // slot [0] of the third index: the held value, unchanged. NOTE(review): both stores need destinations suitably aligned for T -- confirm
+    * ((T *) &samples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(data); // slot [1] of the third index: conj_perm(data); conj_perm is defined outside this view
+    //T tmp = conj_perm(data); // byte-copy alternative (this line and the two memcpy lines below), kept for reference
+    //memcpy(&samples[loadRecv][loadPol][0][loadTime][0], &data, sizeof(T));
+    //memcpy(&samples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T));
   }
 
 #if defined ASYNC_COPIES
@@ -228,11 +228,11 @@ template <typename T> struct FetchData
 
   template<typename Bsamples> __device__ void fixB(Bsamples bSamples)
   {
-    //* ((T *) &bSamples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(* ((T *) &bSamples[loadRecv][loadPol][0][loadTime][0]));
-    T tmp;
-    memcpy(&tmp, &bSamples[loadRecv][loadPol][0][loadTime][0], sizeof(T));
-    tmp = conj_perm(tmp);
-    memcpy(&bSamples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T));
+    * ((T *) &bSamples[loadRecv][loadPol][1][loadTime][0]) = conj_perm(* ((T *) &bSamples[loadRecv][loadPol][0][loadTime][0])); // reads slot [0] as one T, applies conj_perm, and writes the result to slot [1] of the same bSamples entry. NOTE(review): both addresses must be suitably aligned for T -- confirm
+    //T tmp; // byte-copy alternative (this line through the last memcpy below), kept for reference
+    //memcpy(&tmp, &bSamples[loadRecv][loadPol][0][loadTime][0], sizeof(T));
+    //tmp = conj_perm(tmp);
+    //memcpy(&bSamples[loadRecv][loadPol][1][loadTime][0], &tmp, sizeof(T));
   }
 #endif
 
@@ -438,13 +438,13 @@ template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities vi
 	}
       } else {
 	for (unsigned z = 0, i = 0; z < 3; z ++) {
-	  for (unsigned x = 0; x < nrFragmentsX; x ++)
+	  for (unsigned x = 0; x < 16 / NR_RECEIVERS_PER_TCM_X; x ++)
 	    load_matrix_sync(bFrag[x], &bSamples[buffer][/*recvXoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_X * x][0][0][minorTime][0], sizeof(bSamples[0][0][0][0]) * 8 / NR_BITS);
 
 	  for (unsigned y = 0; y < 2; y ++) {
 	    load_matrix_sync(aFrag, &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS);
 
-	    for (unsigned x = 0; x < (NR_BITS == 4 ? 4 : 2) * (y + 1); x ++, i ++)
+	    for (unsigned x = 0; x < 8 * (y + 1) / NR_RECEIVERS_PER_TCM_X; x ++, i ++)
 	      mma_sync(sum[i], aFrag, bFrag[x], sum[i]);
 	  }
 	}
@@ -666,4 +666,13 @@ void correlate(Visibilities visibilities, const Samples samples)
 #endif
   else
     doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace);
+
+#if 0 // debug aid, disabled by default: one thread per block prints its block indices, the SM id, and clock64()
+  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { // only thread (0,0,0) reports, so there is one line per block
+    //((uint64_t *) visibilities)[blockIdx. y * 81 + blockIdx.x] = clock64();
+    unsigned sm;
+    asm volatile ("mov.u32 %0, %%smid;" : "=r"(sm)); // "%%" emits a literal '%': %smid is a PTX special register, not an asm operand; volatile keeps the read from being moved or merged
+    printf("block %u channel %u sm %u at %llu\n", blockIdx.x, blockIdx.y, sm, (unsigned long long) clock64()); // clock64() returns a signed long long; cast to match %llu
+  }
+#endif
 }