diff --git a/README.md b/README.md index a1ee014fe75b6d980bdec002227e6d7fb0660a59..6ecad7da010efb0083ecae0b21bd49235ff89139 100644 --- a/README.md +++ b/README.md @@ -32,36 +32,49 @@ how to use the CUDA driver API (wrappers); and OpenCL program. `test/CorrelatorTest/CorrelatorTest.cc` is a much more versatile, robust (and complex) example than `test/SimpleExample/SimpleExample.cu`. -Input and output data types are defined as follows: +The TCC accepts the following input data types: +- half precision floating point (a.k.a. fp16), starting from Volta (sm\_70) +- e4m3 and e5m2 (a.k.a. fp8), starting from Hopper (sm\_90) +- 8-bit integers (i8), starting from the Jetson Xavier (sm\_72) +- 4-bit integers (i4), only natively supported on Ampere and Ada ``` -#if NR_BITS == 4 +#if INPUT_FORMAT == FORMAT_I4 +#define NR_TIMES_PER_BLOCK 32 typedef complex_int4_t Sample; typedef std::complex<int32_t> Visibility; -#elif NR_BITS == 8 +#elif INPUT_FORMAT == FORMAT_I8 +#define NR_TIMES_PER_BLOCK 16 typedef std::complex<int8_t> Sample; typedef std::complex<int32_t> Visibility; -#elif NR_BITS == 16 +#elif INPUT_FORMAT == FORMAT_E4M3 +#define NR_TIMES_PER_BLOCK 16 +typedef std::complex<__nv_fp8_e4m3> Sample; +typedef std::complex<float> Visibility; +#elif INPUT_FORMAT == FORMAT_E5M2 +#define NR_TIMES_PER_BLOCK 16 +typedef std::complex<__nv_fp8_e5m2> Sample; +typedef std::complex<float> Visibility; +#elif INPUT_FORMAT == FORMAT_FP16 +#define NR_TIMES_PER_BLOCK 8 typedef std::complex<__half> Sample; typedef std::complex<float> Visibility; #endif -#define NR_TIMES_PER_BLOCK (128 / NR_BITS) typedef Sample Samples[NR_CHANNELS][NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK][NR_RECEIVERS][NR_POLARIZATIONS][NR_TIMES_PER_BLOCK]; typedef Visibility Visibilities[NR_CHANNELS][NR_BASELINES][NR_POLARIZATIONS][NR_POLARIZATIONS]; ``` -Note that in 4-bit and 8-bit mode, the input samples may not contain -8 or -128 +Note that with FORMAT\_I4 and FORMAT\_I8, the input samples may not contain -8 or -128 
respectively, as these values cannot be conjugated properly. The input data type (`Samples`) is a weird format, but this seemed to be the only format that yields good performance (tensor cores are very unforgiving). Limitations: - `NR_POLARIZATIONS` must be 2 -- `NR_BITS` must be 4, 8, or 16 -- the amount of samples over which is integrated) must be a multiple of 128 / `NR_BITS` - (i.e., 32, 16, or 8 for 4-bit, 8-bit, or 16-bit input, respectively). +- the number of samples that is integrated over must be a multiple of +`NR_TIMES_PER_BLOCK` (i.e., 32 for i4, 16 for i8 and fp8, or 8 for fp16 input). ## Building, testing, and installation Clone the repository: