diff --git a/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu b/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu
index 2f1cd6c49987a59df0af91e4f4762344cba1f158..2f0e4a3029f5077dce06ee730654fdeb52bc9eb6 100644
--- a/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu
+++ b/RTCP/Cobalt/GPUProc/share/gpu/kernels/DelayAndBandPass.cu
@@ -181,7 +181,9 @@ extern "C" {
 #if defined DELAY_COMPENSATION
     DelaysType delaysAtBegin = (DelaysType)delaysAtBeginPtr;
     DelaysType delaysAfterEnd = (DelaysType)delaysAfterEndPtr;
+#if not defined DOPPLER_CORRECTION
     Phase0sType phase0s = (Phase0sType)phase0sPtr;
+#endif
 
     /*
     * Delay compensation means rotating the phase of each sample BACK.
@@ -222,8 +224,15 @@ extern "C" {
     // Calculate the angles to rotate for for the first and (beyond the) last sample.
     //
     // We need to undo the delay, so we rotate BACK, resulting in a negative constant factor.
+#if defined DOPPLER_CORRECTION   
+    // Since Doppler correction has already been applied at subbandFrequency,
+    // we only correct the increment for each channel frequency
+    const double2 phiAtBegin = -2.0 * M_PI * (frequency-subbandFrequency) * delayAtBegin;
+    const double2 phiAfterEnd = -2.0 * M_PI * (frequency-subbandFrequency) * delayAfterEnd;
+#else
     const double2 phiAtBegin = -2.0 * M_PI * frequency * delayAtBegin - (*phase0s)[delayIdx];
     const double2 phiAfterEnd = -2.0 * M_PI * frequency * delayAfterEnd - (*phase0s)[delayIdx];
+#endif
 #endif
 
     for (unsigned time = timeStart; time < NR_SAMPLES_PER_CHANNEL; time += timeInc)
diff --git a/RTCP/Cobalt/GPUProc/share/gpu/kernels/FIR_Filter.cu b/RTCP/Cobalt/GPUProc/share/gpu/kernels/FIR_Filter.cu
index a86868ed1df2b57bb26acd0f385e639a6d370784..8cdaaace6efbee13df7ae6d926f92c87fa4e4fba 100644
--- a/RTCP/Cobalt/GPUProc/share/gpu/kernels/FIR_Filter.cu
+++ b/RTCP/Cobalt/GPUProc/share/gpu/kernels/FIR_Filter.cu
@@ -98,6 +98,7 @@ typedef float (*FilteredDataType)[NR_STABS][NR_POLARIZATIONS][NR_SAMPLES_PER_CHA
 typedef const float (*WeightsType)[NR_CHANNELS][NR_TAPS];
 
 #ifdef DOPPLER_CORRECTION
+#include<assert.h>
 // this is faster than doing "pol ? sin(phi) : cos(phi)"
 // because that statement forces CUDA to still compute both
 // as GPUs always compute both branches.
@@ -112,6 +113,24 @@ inline __device__ float sincos_f2f_select(float phi, int pol)
 #define NR_SAMPLES_PER_SUBBAND (NR_SAMPLES_PER_CHANNEL * NR_CHANNELS)
 typedef  const double(*DelaysType)[1][NR_STABS][NR_POLARIZATIONS]; // 2 Polarizations; in seconds
 typedef  const double(*Phase0sType)[NR_STABS][NR_POLARIZATIONS]; // 2 Polarizations; in radians
+
+inline __device__ float correction_factor(unsigned time, double phiAtBegin, double phiAfterEnd, int ri, int pol, unsigned channel)
+{
+    // Offset of this sample between begin and end.
+    const double blockOffset = double(time * NR_CHANNELS + channel) / NR_SAMPLES_PER_SUBBAND;
+    //const double blockOffset = double(channel * NR_SAMPLES_PER_CHANNEL + time) / (double)NR_SAMPLES_PER_SUBBAND;
+    // Interpolate the required phase rotation for this sample.
+    //
+    // Single precision angle + sincos is measured to be good enough (2013-11-20).
+    // Note that we do the interpolation in double precision still.
+    // We can afford to if we keep this kernel memory bound.
+    const float phi = phiAtBegin  * (1.0 - blockOffset) + phiAfterEnd * blockOffset;
+    //if (phi>1e3) {
+    // printf("%d %d %e %e ",time,channel,blockOffset,phi);
+    //}
+    return sincos_f2f_select(-phi, (ri+pol)%2);
+}
+#define FACTOR(time) correction_factor(time, phiAtBegin, phiAfterEnd, ri, pol, channel)
 #endif /* DOPPLER_CORRECTION */
 
 /*!
@@ -264,26 +283,15 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
   for (unsigned time = 0; time < NR_SAMPLES_PER_CHANNEL; time += NR_TAPS) 
   {
-#if defined DOPPLER_CORRECTION
-    // Offset of this sample between begin and end.
-    const double blockOffset = double(time * NR_SAMPLES_PER_CHANNEL + channel) / NR_SAMPLES_PER_SUBBAND;
-    // Interpolate the required phase rotation for this sample.
-    //
-    // Single precision angle + sincos is measured to be good enough (2013-11-20).
-    // Note that we do the interpolation in double precision still.
-    // We can afford to if we keep this kernel memory bound.
-    const float phi = phiAtBegin  * (1.0 - blockOffset) + phiAfterEnd * blockOffset;
-    const float factor = sincos_f2f_select(phi, pol);
-#endif
 
 #if defined DOPPLER_CORRECTION
-    delayLine_sF = convertIntToFloat(SAMPLE(time + 0))*factor;
+    delayLine_sF = convertIntToFloat(SAMPLE(time + 0))*FACTOR(time);
 #else
     delayLine_sF = convertIntToFloat(SAMPLE(time + 0));
 #endif
     sum_s0 = weights_sF * delayLine_s0;
 #if defined DOPPLER_CORRECTION
-    delayLine_s0 = convertIntToFloat(SAMPLE(time + 1))*factor;
+    delayLine_s0 = convertIntToFloat(SAMPLE(time + 1))*FACTOR(time+1);
 #else
     delayLine_s0 = convertIntToFloat(SAMPLE(time + 1));
 #endif
@@ -306,7 +314,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s1 = weights_sF * delayLine_s1;
 #if defined DOPPLER_CORRECTION
-    delayLine_s1 = convertIntToFloat(SAMPLE(time + 2))*factor;
+    delayLine_s1 = convertIntToFloat(SAMPLE(time + 2))*FACTOR(time+2);
 #else
     delayLine_s1 = convertIntToFloat(SAMPLE(time + 2));
 #endif
@@ -329,7 +337,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s2 = weights_sF * delayLine_s2;
 #if defined DOPPLER_CORRECTION
-    delayLine_s2 = convertIntToFloat(SAMPLE(time + 3))*factor;
+    delayLine_s2 = convertIntToFloat(SAMPLE(time + 3))*FACTOR(time+3);
 #else
     delayLine_s2 = convertIntToFloat(SAMPLE(time + 3));
 #endif
@@ -352,7 +360,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s3 = weights_sF * delayLine_s3;
 #if defined DOPPLER_CORRECTION
-    delayLine_s3 = convertIntToFloat(SAMPLE(time + 4))*factor;
+    delayLine_s3 = convertIntToFloat(SAMPLE(time + 4))*FACTOR(time+4);
 #else
     delayLine_s3 = convertIntToFloat(SAMPLE(time + 4));
 #endif
@@ -375,7 +383,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s4 = weights_sF * delayLine_s4;
 #if defined DOPPLER_CORRECTION
-    delayLine_s4 = convertIntToFloat(SAMPLE(time + 5))*factor;
+    delayLine_s4 = convertIntToFloat(SAMPLE(time + 5))*FACTOR(time+5);
 #else
     delayLine_s4 = convertIntToFloat(SAMPLE(time + 5));
 #endif
@@ -398,7 +406,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s5 = weights_sF * delayLine_s5;
 #if defined DOPPLER_CORRECTION
-    delayLine_s5 = convertIntToFloat(SAMPLE(time + 6))*factor;
+    delayLine_s5 = convertIntToFloat(SAMPLE(time + 6))*FACTOR(time+6);
 #else
     delayLine_s5 = convertIntToFloat(SAMPLE(time + 6));
 #endif
@@ -421,7 +429,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s6 = weights_sF * delayLine_s6;
 #if defined DOPPLER_CORRECTION
-    delayLine_s6 = convertIntToFloat(SAMPLE(time + 7))*factor;
+    delayLine_s6 = convertIntToFloat(SAMPLE(time + 7))*FACTOR(time+7);
 #else
     delayLine_s6 = convertIntToFloat(SAMPLE(time + 7));
 #endif
@@ -444,7 +452,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s7 = weights_sF * delayLine_s7;
 #if defined DOPPLER_CORRECTION
-    delayLine_s7 = convertIntToFloat(SAMPLE(time + 8))*factor;
+    delayLine_s7 = convertIntToFloat(SAMPLE(time + 8))*FACTOR(time+8);
 #else
     delayLine_s7 = convertIntToFloat(SAMPLE(time + 8));
 #endif
@@ -467,7 +475,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s8 = weights_sF * delayLine_s8;
 #if defined DOPPLER_CORRECTION
-    delayLine_s8 = convertIntToFloat(SAMPLE(time + 9))*factor;
+    delayLine_s8 = convertIntToFloat(SAMPLE(time + 9))*FACTOR(time+9);
 #else
     delayLine_s8 = convertIntToFloat(SAMPLE(time + 9));
 #endif
@@ -490,7 +498,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_s9 = weights_sF * delayLine_s9;
 #if defined DOPPLER_CORRECTION
-    delayLine_s9 = convertIntToFloat(SAMPLE(time + 10))*factor;
+    delayLine_s9 = convertIntToFloat(SAMPLE(time + 10))*FACTOR(time+10);
 #else
     delayLine_s9 = convertIntToFloat(SAMPLE(time + 10));
 #endif
@@ -513,7 +521,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_sA = weights_sF * delayLine_sA;
 #if defined DOPPLER_CORRECTION
-    delayLine_sA = convertIntToFloat(SAMPLE(time + 11))*factor;
+    delayLine_sA = convertIntToFloat(SAMPLE(time + 11))*FACTOR(time+11);
 #else
     delayLine_sA = convertIntToFloat(SAMPLE(time + 11));
 #endif
@@ -536,7 +544,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_sB = weights_sF * delayLine_sB;
 #if defined DOPPLER_CORRECTION
-    delayLine_sB = convertIntToFloat(SAMPLE(time + 12))*factor;
+    delayLine_sB = convertIntToFloat(SAMPLE(time + 12))*FACTOR(time+12);
 #else
     delayLine_sB = convertIntToFloat(SAMPLE(time + 12));
 #endif
@@ -559,7 +567,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_sC = weights_sF * delayLine_sC;
 #if defined DOPPLER_CORRECTION
-    delayLine_sC = convertIntToFloat(SAMPLE(time + 13))*factor;
+    delayLine_sC = convertIntToFloat(SAMPLE(time + 13))*FACTOR(time+13);
 #else
     delayLine_sC = convertIntToFloat(SAMPLE(time + 13));
 #endif
@@ -582,7 +590,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_sD = weights_sF * delayLine_sD;
 #if defined DOPPLER_CORRECTION
-    delayLine_sD = convertIntToFloat(SAMPLE(time + 14))*factor;
+    delayLine_sD = convertIntToFloat(SAMPLE(time + 14))*FACTOR(time+14);
 #else
     delayLine_sD = convertIntToFloat(SAMPLE(time + 14));
 #endif
@@ -605,7 +613,7 @@ __global__ void FIR_filter( void *filteredDataPtr,
 
     sum_sE = weights_sF * delayLine_sE;
 #if defined DOPPLER_CORRECTION
-    delayLine_sE = convertIntToFloat(SAMPLE(time + 15))*factor;
+    delayLine_sE = convertIntToFloat(SAMPLE(time + 15))*FACTOR(time+15);
 #else
     delayLine_sE = convertIntToFloat(SAMPLE(time + 15));
 #endif
@@ -649,17 +657,8 @@ __global__ void FIR_filter( void *filteredDataPtr,
   {
 #if defined DOPPLER_CORRECTION 
     const unsigned time = NR_SAMPLES_PER_CHANNEL - (NR_TAPS - 1) + tap;
-    // Offset of this sample between begin and end.
-    const double blockOffset = double(time * NR_SAMPLES_PER_CHANNEL + channel) / NR_SAMPLES_PER_SUBBAND;
-    // Interpolate the required phase rotation for this sample.
-    //
-    // Single precision angle + sincos is measured to be good enough (2013-11-20).
-    // Note that we do the interpolation in double precision still.
-    // We can afford to if we keep this kernel memory bound.
-    const float phi = phiAtBegin  * (1.0 - blockOffset) + phiAfterEnd * blockOffset;
-    const float factor = sincos_f2f_select(phi, pol);
     (*historyData)[subbandIdx][station][tap][channel][pol_ri] = // FIXME : subbandIdx=1 assumed here
-      convertIntToFloat(SAMPLE(time))*factor;
+      convertIntToFloat(SAMPLE(time))*FACTOR(time);
 #else
     (*historyData)[subbandIdx][station][tap][channel][pol_ri] = // FIXME : subbandIdx=1 assumed here
       SAMPLE(NR_SAMPLES_PER_CHANNEL - (NR_TAPS - 1) + tap);