From e0cde58d52bf2fdb27b112b49343803649e6d27c Mon Sep 17 00:00:00 2001
From: John Romein <romein@astron.nl>
Date: Tue, 15 Apr 2025 14:04:11 +0200
Subject: [PATCH] Updated for e4m3/e5m2 support.

---
 README.md | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index a1ee014..6ecad7d 100644
--- a/README.md
+++ b/README.md
@@ -32,36 +32,49 @@ how to use the CUDA driver API (wrappers); and
 OpenCL program.  `test/CorrelatorTest/CorrelatorTest.cc` is a much more versatile,
 robust (and complex) example than `test/SimpleExample/SimpleExample.cu`.
 
-Input and output data types are defined as follows:
+The TCC accepts the following input data types:
+- half precision floating point (a.k.a. fp16), starting from Volta (sm\_70)
+- e4m3 and e5m2 (a.k.a. fp8), starting from Hopper (sm\_90)
+- 8-bit integers (i8), starting from the Jetson Xavier (sm\_72)
+- 4-bit integers (i4), only natively supported on Ampere and Ada
 
 ```
-#if NR_BITS == 4
+#if INPUT_FORMAT == FORMAT_I4
+#define NR_TIMES_PER_BLOCK    32
 typedef complex_int4_t        Sample;
 typedef std::complex<int32_t> Visibility;
-#elif NR_BITS == 8
+#elif INPUT_FORMAT == FORMAT_I8
+#define NR_TIMES_PER_BLOCK    16
 typedef std::complex<int8_t>  Sample;
 typedef std::complex<int32_t> Visibility;
-#elif NR_BITS == 16
+#elif INPUT_FORMAT == FORMAT_E4M3
+#define NR_TIMES_PER_BLOCK    16
+typedef std::complex<__nv_fp8_e4m3>  Sample;
+typedef std::complex<float>   Visibility;
+#elif INPUT_FORMAT == FORMAT_E5M2
+#define NR_TIMES_PER_BLOCK    16
+typedef std::complex<__nv_fp8_e5m2>  Sample;
+typedef std::complex<float>   Visibility;
+#elif INPUT_FORMAT == FORMAT_FP16
+#define NR_TIMES_PER_BLOCK    8
 typedef std::complex<__half>  Sample;
 typedef std::complex<float>   Visibility;
 #endif
 
-#define NR_TIMES_PER_BLOCK (128 / NR_BITS)
 
 typedef Sample Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK];
 typedef Visibility Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS];
 ```
 
-Note that in 4-bit and 8-bit mode, the input samples may not contain -8 or -128
+Note that with `FORMAT_I4` and `FORMAT_I8`, the input samples must not contain -8 or -128
 respectively, as these values cannot be conjugated properly.
 The input data type (`Samples`) is a weird format, but this seemed to be the only
 format that yields good performance (tensor cores are very unforgiving).
 
 Limitations:
 - `NR_POLARIZATIONS` must be 2
-- `NR_BITS` must be 4, 8, or 16
-- the amount of samples over which is integrated) must be a multiple of 128 / `NR_BITS`
-  (i.e., 32, 16, or 8 for 4-bit, 8-bit, or 16-bit input, respectively).
+- the number of samples that is integrated over must be a multiple of
+  `NR_TIMES_PER_BLOCK`.
 
 ## Building, testing, and installation
 Clone the repository:
-- 
GitLab