Implement direct write of visibilities from registers on AMD GPUs

f72b3bf2 · Leon Oostrum · d752471c · f72b3bf2
Commit f72b3bf2 authored 9 months ago by Leon Oostrum
--- a/libtcc/kernel/TCCorrelator.cu
+++ b/libtcc/kernel/TCCorrelator.cu
@@ -25,6 +25,9 @@
 #ifdef __HIP_PLATFORM_AMD__
 #if defined(__HIP_DEVICE_COMPILE__) && (!defined(__GFX9__) && !defined(__GFX11__))
 #error this architecture has no suitable tensor cores
+#elif defined(__HIP_DEVICE_COMPILE__) && NR_BITS == 8 && (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942))
+// CDNA3 does not support 16x16x16 instruction, only 16x16x32 and 32x32x16
+#error 8-bit mode is not supported on CDNA3
 #endif
 #else
 #if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 700 : 0)
@@ -32,9 +35,15 @@
 #endif
 #endif

+#ifdef __HIP_PLATFORM_AMD__
+#if !defined(__GFX9__) && !defined(__GFX11__)
+#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory)
+#endif
+#else
 #if __CUDA_ARCH__ != 700 && __CUDA_ARCH__ != 720 && __CUDA_ARCH__ != 750 && __CUDA_ARCH__ != 800 && __CUDA_ARCH__ != 860 && __CUDA_ARCH__ != 870 && __CUDA_ARCH__ != 890 && __CUDA_ARCH__ != 900
 #define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory)
 #endif
+#endif

 #if NR_RECEIVERS_PER_BLOCK != 32 && NR_RECEIVERS_PER_BLOCK != 48 && NR_RECEIVERS_PER_BLOCK != 64
 #error unsupported NR_RECEIVERS_PER_BLOCK
@@ -294,13 +303,20 @@ CUSTOM_STORE_VISIBILITY

 template <bool add> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned polY, unsigned polX, Visibility visibility)
 {
-  if (add) {
+  if (add)
    visibilities[channel][baseline][polY][polX] += visibility;
-
-  } else
+  else
    visibilities[channel][baseline][polY][polX]  = visibility;
 }

+template <bool add> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned polY, unsigned polX, unsigned c, Value value)
+{
+  if (add)
+    reinterpret_cast<Value *>(&visibilities[channel][baseline][polY][polX])[c] += value;
+  else
+    reinterpret_cast<Value *>(&visibilities[channel][baseline][polY][polX])[c]  = value;
+}
+
 #endif


@@ -311,6 +327,13 @@ template <bool add, typename T> __device__ inline void storeVisibility(Visibilit
 }


+template <bool add, typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned polY, unsigned polX, unsigned c, bool skipCheckY, bool skipCheckX, T sum)
+{
+  if ((skipCheckY || recvY <= recvX) && (skipCheckX || recvX < NR_RECEIVERS))
+    storeVisibility<add>(visibilities, channel, baseline, polY, polX, c, sum);
+}
+
+
 template <bool add>__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp)
 {
 #if defined PORTABLE
@@ -351,6 +374,33 @@ template <bool add>__device__ inline void storeVisibilities(Visibilities visibil
        visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX];
 #endif
 #else
+#ifdef __HIP_PLATFORM_AMD__
+#if NR_BITS == 8 || NR_BITS == 16
+    for (unsigned i = 0; i < sum.num_elements; i++) {
+#if defined __GFX9__
+        unsigned row = 4 * (threadIdx.x / 16) + (i % 4);
+#elif defined __GFX11__
+        unsigned row = (threadIdx.x / 16) + 2 * i;
+#endif
+        unsigned col = threadIdx.x % 16;
+
+        unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + row / 2;
+        unsigned polY = row % 2;
+
+        unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + col / 4;
+        unsigned polX = (col / 2) % 2;
+        unsigned c = col % 2;
+
+        unsigned baseline = (recvX * (recvX + 1) / 2) + recvY;
+
+        storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, polY, polX, c, skipCheckY, skipCheckX, sum.x[i]);
+        // ToDo: Figure out why the results are wrong in some cases without this sync. e.g. CorrelatorTest -b 16 -c 1 -n 33 -N 32 -r 1 -R 1 -t 8
+#if NR_BITS == 16
+        __syncthreads();
+#endif
+    }
+#endif
+#else
 #if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16)
  unsigned recvY    = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4);
  unsigned recvX    = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2);
@@ -381,6 +431,7 @@ template <bool add>__device__ inline void storeVisibilities(Visibilities visibil
 #endif
 #endif
 #endif
+#endif
 }