Optimize ComputeSmearterm

In the current implementation, the ComputeSmearterm function introduces a lot of overhead when freqsmear is set to true. In particular, the fabs computation appears to be very expensive, as shown in the perf annotation below:
       │      // FIXME: move this to a more appropriate location                                                                                       
       │      inline float ComputeSmearterm(double uvw, double halfwidth) {                                                                            
       │      float smearterm = static_cast<float>(uvw) * static_cast<float>(halfwidth);                                                               
       │      return (smearterm == 0.0f) ? 1.0f                                                                                                        
  0.04 │        vxorps      %xmm7,%xmm7,%xmm7                                                                                                          
       │        vmovsd      (%r14),%xmm1                                                                                                               
       │        vmovsd      vtable for benchmark::BenchmarkReporter+0x50,%xmm6                                                                         
       │        vsubsd      (%rax),%xmm1,%xmm1                                                                                                         
       │        mov         -0x158(%rbp),%rax                                                                                                          
  4.23 │        vmulsd      (%rax,%r13,8),%xmm6,%xmm0                                                                                                  
       │      float smearterm = static_cast<float>(uvw) * static_cast<float>(halfwidth);                                                               
  0.00 │        vcvtsd2ss   %xmm1,%xmm1,%xmm1                                                                                                          
       │        vcvtsd2ss   %xmm0,%xmm0,%xmm0                                                                                                          
  0.02 │        vmulss      %xmm0,%xmm1,%xmm1                                                                                                          
       │      return (smearterm == 0.0f) ? 1.0f                                                                                                        
       │        vucomiss    %xmm7,%xmm1                                                                                                                
       │      ↓ jp          2cd                                                                                                                        
       │      ↓ je          310                                                                                                                        
       │      using ::sin;                                                                                                                             
       │                                                                                                                                               
       │      #ifndef __CORRECT_ISO_CPP_MATH_H_PROTO                                                                                                   
       │      inline _GLIBCXX_CONSTEXPR float                                                                                                          
       │      sin(float __x)                                                                                                                           
       │      { return __builtin_sinf(__x); }                                                                                                          
  3.82 │ 2cd:   vmovaps     %xmm1,%xmm0                                                                                                                
       │        vmovss      %xmm1,-0x170(%rbp)                                                                                                         
       │      → callq       sinf@plt                                                                                                                   
       │      : std::fabs(std::sin(smearterm) / smearterm);                                                                                            
       │        vmovss      -0x170(%rbp),%xmm1                                                                                                         
  0.13 │        mov         -0x168(%rbp),%rax                                                                                                          
  0.07 │        vdivss      %xmm1,%xmm0,%xmm0                                                                                                          
       │      { return __builtin_fabsf(__x); }                                                                                                         
 46.37 │        vandps      0xfd67(%rip),%xmm0,%xmm0        # 48f980 <std::__detail::__to_chars_10_impl<unsigned long>(char*, unsigned int, unsigned lo
  7.23 │        vcvtss2sd   %xmm0,%xmm0,%xmm0                                                                                                          
  3.51 │        vmovsd      %xmm0,(%rax,%r13,8)                                                                                                        
       │      for (size_t ch = 0; ch < nchannels; ++ch) {                                                                                              
 10.59 │        inc         %r13
Ideas:
We could move to an xtensor-based implementation that utilizes SIMD-based computation.
Edited May 09, 2025 by Wiebe van Breukelen