Skip to content
Snippets Groups Projects

Simplify the FDD dedisperse_kernel without inline PTX and operator device functions

Merged Bram Veenboer requested to merge update-fdd-kernel into main
1 file
+ 10
49
Compare changes
  • Side-by-side
  • Inline
@@ -22,54 +22,15 @@ __constant__ dedisp_float c_delay_table[DEDISP_MAX_NCHANS];
/*
* Helper functions
*/
// Multiply two float2 operands
inline __device__ float2 operator*(float2 a, float2 b) {
float2 c;
asm("mul.f32 %0,%1,%2;" : "=f"(c.x) : "f"(a.x), "f"(b.x));
asm("mul.f32 %0,%1,%2;" : "=f"(c.y) : "f"(a.x), "f"(b.y));
asm("fma.rn.ftz.f32 %0,%1,%2,%3;"
: "=f"(c.x)
: "f"(-a.y), "f"(b.y), "f"(c.x));
asm("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(c.y) : "f"(a.y), "f"(b.x), "f"(c.y));
return c;
}
// Add and assign two float2 operands
inline __device__ void operator+=(float2 &a, float2 b) {
a.x += b.x;
a.y += b.y;
}
// Multiply and assign two float2 operands
inline __device__ void operator*=(float2 &a, float2 b) {
float2 c = a * b;
a.x = c.x;
a.y = c.y;
}
// Multiply-and-accumulate (MAC) for complex operands
inline __device__ void cmac(float2 &a, float2 b, float2 c) {
asm("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(a.x) : "f"(b.x), "f"(c.x), "f"(a.x));
asm("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(a.y) : "f"(b.x), "f"(c.y), "f"(a.y));
asm("fma.rn.ftz.f32 %0,%1,%2,%3;"
: "=f"(a.x)
: "f"(-b.y), "f"(c.y), "f"(a.x));
asm("fma.rn.ftz.f32 %0,%1,%2,%3;" : "=f"(a.y) : "f"(b.y), "f"(c.x), "f"(a.y));
}
// Use the Special Function Unit (SFU) for the sine evaluation
inline __device__ float raw_sin(float a) {
float r;
asm("sin.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
return r;
a.x += b.x * c.x;
a.y += b.x * c.y;
a.x -= b.y * c.y;
a.y += b.y * c.x;
}
// Use the Special Function Unit (SFU) for the cosine evaluation
inline __device__ float raw_cos(float a) {
float r;
asm("cos.approx.ftz.f32 %0,%1;" : "=f"(r) : "f"(a));
return r;
inline __device__ float2 cmul(float2 a, float2 b) {
return {a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x};
}
/*
@@ -167,9 +128,9 @@ dedisperse_kernel(size_t nfreq, float dt, const float *d_spin_frequencies,
float phase0 = 2.0f * ((float)M_PI) * f * tdm0;
float phase1 = 2.0f * ((float)M_PI) * f * tdm1;
float phase_delta = phase1 - phase0;
phasors[i] = make_float2(raw_cos(phase0), raw_sin(phase0));
phasors[i] = make_float2(__cosf(phase0), __sinf(phase0));
phasors_delta[i] =
make_float2(raw_cos(phase_delta), raw_sin(phase_delta));
make_float2(__cosf(phase_delta), __sinf(phase_delta));
}
#pragma unroll
@@ -191,7 +152,7 @@ dedisperse_kernel(size_t nfreq, float dt, const float *d_spin_frequencies,
cmac(sums[i], sample, phasors[i]);
// Update phasor
phasors[i] *= phasors_delta[i];
phasors[i] = cmul(phasors[i], phasors_delta[i]);
}
} // end for ichan_inner
} else // Not using the extrapolation feature
@@ -217,7 +178,7 @@ dedisperse_kernel(size_t nfreq, float dt, const float *d_spin_frequencies,
float phase = 2.0f * ((float)M_PI) * f * tdm;
// Compute phasor
float2 phasor = make_float2(raw_cos(phase), raw_sin(phase));
float2 phasor = make_float2(__cosf(phase), __sinf(phase));
// Update sum
cmac(sums[i], sample, phasor);
Loading