Optimize ComputeSmearterm
In the current implementation, the `ComputeSmearterm` function introduces a lot of overhead when `freqsmear` is set to true. In particular, the `fabs` computation is very expensive, as shown below:
│ // FIXME: move this to a more appropriate location
│ inline float ComputeSmearterm(double uvw, double halfwidth) {
│ float smearterm = static_cast<float>(uvw) * static_cast<float>(halfwidth);
│ return (smearterm == 0.0f) ? 1.0f
0.04 │ vxorps %xmm7,%xmm7,%xmm7
│ vmovsd (%r14),%xmm1
│ vmovsd vtable for benchmark::BenchmarkReporter+0x50,%xmm6
│ vsubsd (%rax),%xmm1,%xmm1
│ mov -0x158(%rbp),%rax
4.23 │ vmulsd (%rax,%r13,8),%xmm6,%xmm0
│ float smearterm = static_cast<float>(uvw) * static_cast<float>(halfwidth);
0.00 │ vcvtsd2ss %xmm1,%xmm1,%xmm1
│ vcvtsd2ss %xmm0,%xmm0,%xmm0
0.02 │ vmulss %xmm0,%xmm1,%xmm1
│ return (smearterm == 0.0f) ? 1.0f
│ vucomiss %xmm7,%xmm1
│ ↓ jp 2cd
│ ↓ je 310
│ using ::sin;
│
│ #ifndef __CORRECT_ISO_CPP_MATH_H_PROTO
│ inline _GLIBCXX_CONSTEXPR float
│ sin(float __x)
│ { return __builtin_sinf(__x); }
3.82 │ 2cd: vmovaps %xmm1,%xmm0
│ vmovss %xmm1,-0x170(%rbp)
│ → callq sinf@plt
│ : std::fabs(std::sin(smearterm) / smearterm);
│ vmovss -0x170(%rbp),%xmm1
0.13 │ mov -0x168(%rbp),%rax
0.07 │ vdivss %xmm1,%xmm0,%xmm0
│ { return __builtin_fabsf(__x); }
46.37 │ vandps 0xfd67(%rip),%xmm0,%xmm0 # 48f980 <std::__detail::__to_chars_10_impl<unsigned long>(char*, unsigned int, unsigned lo
7.23 │ vcvtss2sd %xmm0,%xmm0,%xmm0
3.51 │ vmovsd %xmm0,(%rax,%r13,8)
│ for (size_t ch = 0; ch < nchannels; ++ch) {
10.59 │ inc %r13
Ideas:
- We could move to an xtensor-based implementation that utilizes SIMD-based computation.
Edited by Wiebe van Breukelen