diff --git a/test/Benchmark/Benchmark.cc b/test/Benchmark/Benchmark.cc index 6798dbcb003fb9f28dd06362f486f5c59be4c3b1..54b3792effe4e9961432f6aa5828be41d3b455b9 100644 --- a/test/Benchmark/Benchmark.cc +++ b/test/Benchmark/Benchmark.cc @@ -8,6 +8,8 @@ #include <cstring> #include <iostream> +#include <cuda_fp8.h> + #include <cudawrappers/nvrtc.hpp> #define GNU_SOURCE @@ -28,7 +30,7 @@ Benchmark::Benchmark() using Format = tcc::Format; - for (Format format : { Format::fp16, Format::e4m3, Format::e5m2, Format::i8, Format::i4 }) + for (Format format : { Format::fp16, Format::e4m3, Format::i8, Format::i4 }) // e5m2 not tested separately, as it performs equal to e4m3 #pragma omp for collapse(2) schedule(dynamic) ordered for (unsigned nrReceivers = 1; nrReceivers <= 576; nrReceivers ++) for (unsigned nrReceiversPerBlock = 32; nrReceiversPerBlock <= 64; nrReceiversPerBlock += 16) @@ -43,6 +45,11 @@ Benchmark::Benchmark() break; + case Format::e4m3 : if (capability >= 90) + doTest<std::complex<__nv_fp8_e4m3>, std::complex<float>>(format, nrReceiversPerBlock, nrReceivers); + + break; + case Format::fp16 : if (capability >= 70) doTest<std::complex<__half>, std::complex<float>>(format, nrReceiversPerBlock, nrReceivers); diff --git a/test/Benchmark/Benchmark.h b/test/Benchmark/Benchmark.h index caac2a6879e25dc49300fb56bf0ae723082f7b83..50d38f1b48da0673d4d2e67e4f9e7cfca5c397e5 100644 --- a/test/Benchmark/Benchmark.h +++ b/test/Benchmark/Benchmark.h @@ -37,11 +37,12 @@ template<> std::complex<int8_t> Benchmark::randomValue<std::complex<int8_t>>() } -template<> std::complex<__half> Benchmark::randomValue<std::complex<__half>>() +template<typename SampleType> SampleType Benchmark::randomValue() { - return std::complex<__half>(drand48() - .5, drand48() - .5); + return SampleType((typename SampleType::value_type) (drand48() - .5), (typename SampleType::value_type) (drand48() - .5)); } + template <typename VisibilityType> bool Benchmark::approximates(const VisibilityType &a, const VisibilityType &b) const { return a == b;