diff --git a/.gitattributes b/.gitattributes
index 487a7678f0b1a64ca20d44a865dec0842d1bc3c6..48ca44cd2d84f678ccd26442d82be637eb783cc6 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -3657,7 +3657,6 @@ RTCP/GPUProc/src/Align.h -text
 RTCP/GPUProc/src/BandPass.cc -text
 RTCP/GPUProc/src/BandPass.h -text
 RTCP/GPUProc/src/BeamFormer/BeamFormer.cl -text
-RTCP/GPUProc/src/BeamFormer/BeamFormer.cl-0.ptx -text
 RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.4x3 -text
 RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.6x3 -text
 RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.bak -text
@@ -3665,16 +3664,11 @@ RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.not -text
 RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.ok -text
 RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.orig -text
 RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl -text
-RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl-0.ptx -text
 RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl.ok -text
 RTCP/GPUProc/src/BeamFormer/Dedispersion.cl -text
-RTCP/GPUProc/src/BeamFormer/Dedispersion.cl-0.ptx -text
 RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl -text
-RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl-0.ptx -text
 RTCP/GPUProc/src/BeamFormer/IntToFloat.cl -text
-RTCP/GPUProc/src/BeamFormer/IntToFloat.cl-0.ptx -text
 RTCP/GPUProc/src/BeamFormer/Transpose.cl -text
-RTCP/GPUProc/src/BeamFormer/Transpose.cl-0.ptx -text
 RTCP/GPUProc/src/BeamletBuffer.cc -text
 RTCP/GPUProc/src/BeamletBuffer.h -text
 RTCP/GPUProc/src/BeamletBufferToComputeNode.cc -text
@@ -3682,14 +3676,11 @@ RTCP/GPUProc/src/BeamletBufferToComputeNode.h -text
 RTCP/GPUProc/src/CL/cl.hpp -text
 RTCP/GPUProc/src/CMakeLists.txt -text
 RTCP/GPUProc/src/Correlator.cl -text
-RTCP/GPUProc/src/Correlator.cl-0.ptx -text
 RTCP/GPUProc/src/DelayAndBandPass.cl -text
-RTCP/GPUProc/src/DelayAndBandPass.cl-0.ptx -text
 RTCP/GPUProc/src/Delays.cc -text
 RTCP/GPUProc/src/Delays.h -text
 RTCP/GPUProc/src/FFT.cl -text
 RTCP/GPUProc/src/FIR.cl -text
-RTCP/GPUProc/src/FIR.cl-0.ptx -text
 RTCP/GPUProc/src/FilterBank.cc -text
 RTCP/GPUProc/src/FilterBank.h -text
 RTCP/GPUProc/src/InputSection.cc -text
@@ -3700,7 +3691,6 @@ RTCP/GPUProc/src/LockedRanges.h -text
 RTCP/GPUProc/src/LogThread.cc -text
 RTCP/GPUProc/src/LogThread.h -text
 RTCP/GPUProc/src/NewCorrelator.cl -text
-RTCP/GPUProc/src/NewCorrelator.cl-0.ptx -text
 RTCP/GPUProc/src/OpenCL_Support.cc -text
 RTCP/GPUProc/src/OpenCL_Support.h -text
 RTCP/GPUProc/src/OpenMP_Support.h -text
@@ -3714,29 +3704,22 @@ RTCP/GPUProc/src/Scheduling.cc -text
 RTCP/GPUProc/src/Scheduling.h -text
 RTCP/GPUProc/src/SlidingPointer.h -text
 RTCP/GPUProc/src/UHEP/BeamFormer.cl -text
-RTCP/GPUProc/src/UHEP/BeamFormer.cl-0.ptx -text
 RTCP/GPUProc/src/UHEP/BeamFormer.cl.4groups -text
 RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop -text
-RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop-0.ptx -text
 RTCP/GPUProc/src/UHEP/BeamFormer.cl.not -text
 RTCP/GPUProc/src/UHEP/BeamFormer.cl.ok -text
 RTCP/GPUProc/src/UHEP/InvFFT.cl -text
-RTCP/GPUProc/src/UHEP/InvFFT.cl-0.ptx -text
 RTCP/GPUProc/src/UHEP/InvFIR.cl -text
-RTCP/GPUProc/src/UHEP/InvFIR.cl-0.ptx -text
 RTCP/GPUProc/src/UHEP/InvertedStationPPFWeights.cc -text
 RTCP/GPUProc/src/UHEP/InvertedStationPPFWeights.h -text
 RTCP/GPUProc/src/UHEP/Transpose.cl -text
-RTCP/GPUProc/src/UHEP/Transpose.cl-0.ptx -text
 RTCP/GPUProc/src/UHEP/Transpose.cl.ok -text
 RTCP/GPUProc/src/UHEP/Trigger.cl -text
-RTCP/GPUProc/src/UHEP/Trigger.cl-0.ptx -text
 RTCP/GPUProc/src/UHEP/Trigger.cl.8 -text
 RTCP/GPUProc/src/UHEP/Trigger.cl.ok -text
 RTCP/GPUProc/src/WallClockTime.h -text
 RTCP/GPUProc/src/fft2.cl -text
 RTCP/GPUProc/src/math.cl -text
-RTCP/GPUProc/src/octave-core -text
 RTCP/GPUProc/test/77_Stations.parset -text
 RTCP/GPUProc/test/AARTFAAC.parset -text
 RTCP/GPUProc/test/CMakeLists.txt -text
diff --git a/RTCP/GPUProc/src/BeamFormer/BeamFormer.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/BeamFormer.cl-0.ptx
deleted file mode 100644
index 7f1d211419f4937459be9ecf7a4c5dbc50a45446..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/BeamFormer/BeamFormer.cl-0.ptx
+++ /dev/null
@@ -1,607 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Sat Sep 29 10:12:13 2012 (1348906333)
-// Driver 304.54
-//
-
-.version 3.0
-.target sm_30, texmode_independent
-.address_size 32
-
-.extern .shared .align 16 .b8 shr_4__local[4096];
-
-.entry complexVoltages(
-	.param .u32 .ptr .global .align 1 complexVoltages_param_0,
-	.param .u32 .ptr .global .align 1 complexVoltages_param_1,
-	.param .u32 .ptr .global .align 1 complexVoltages_param_2
-)
-{
-	.reg .f32 	%f<1124>;
-	.reg .pred 	%p<13>;
-	.reg .s32 	%r<206>;
-
-
-	ld.param.u32 	%r1, [complexVoltages_param_0];
-	ld.param.u32 	%r3, [complexVoltages_param_2];
-	// inline asm
-	mov.u32 	%r80, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r81, %tid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r82, %envreg5;
-	// inline asm
-	// inline asm
-	mov.u32 	%r83, %ntid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r84, %ctaid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r85, %tid.z;
-	// inline asm
-	add.s32 	%r87, %r85, %r82;
-	mad.lo.s32 	%r10, %r84, %r83, %r87;
-	shl.b32 	%r88, %r10, 10;
-	shl.b32 	%r89, %r81, 3;
-	add.s32 	%r90, %r88, %r89;
-	add.s32 	%r91, %r3, %r90;
-	ld.global.v2.f32 	{%f1028, %f1029}, [%r91];
-	ld.global.v2.f32 	{%f1034, %f1035}, [%r91+2097152];
-	ld.global.v2.f32 	{%f1040, %f1041}, [%r91+4194304];
-	ld.global.v2.f32 	{%f1046, %f1047}, [%r91+6291456];
-	ld.global.v2.f32 	{%f1052, %f1053}, [%r91+8388608];
-	ld.global.v2.f32 	{%f1058, %f1059}, [%r91+10485760];
-	ld.global.v2.f32 	{%f1064, %f1065}, [%r91+12582912];
-	ld.global.v2.f32 	{%f1070, %f1071}, [%r91+14680064];
-	ld.global.v2.f32 	{%f1076, %f1077}, [%r91+16777216];
-	ld.global.v2.f32 	{%f1082, %f1083}, [%r91+18874368];
-	ld.global.v2.f32 	{%f1088, %f1089}, [%r91+20971520];
-	ld.global.v2.f32 	{%f1094, %f1095}, [%r91+23068672];
-	ld.global.v2.f32 	{%f1100, %f1101}, [%r91+25165824];
-	ld.global.v2.f32 	{%f1106, %f1107}, [%r91+27262976];
-	ld.global.v2.f32 	{%f1112, %f1113}, [%r91+29360128];
-	ld.global.v2.f32 	{%f1118, %f1119}, [%r91+31457280];
-	shl.b32 	%r92, %r10, 16;
-	shl.b32 	%r93, %r81, 4;
-	add.s32 	%r94, %r92, %r93;
-	shl.b32 	%r95, %r80, 3;
-	add.s32 	%r96, %r94, %r95;
-	add.s32 	%r189, %r1, %r96;
-	mov.u32 	%r97, shr_4__local;
-	add.s32 	%r98, %r97, %r95;
-	add.s32 	%r12, %r98, 2048;
-	mov.u32 	%r182, 0;
-
-BB0_1:
-	mov.u32 	%r187, %r189;
-	mov.u32 	%r13, %r187;
-	shl.b32 	%r15, %r182, 4;
-	// inline asm
-	mov.u32 	%r99, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r100, %tid.y;
-	// inline asm
-	shl.b32 	%r17, %r100, 1;
-	add.s32 	%r184, %r17, %r99;
-	setp.gt.u32 	%p1, %r184, 255;
-	@%p1 bra 	BB0_4;
-
-	add.s32 	%r19, %r99, %r17;
-	and.b32  	%r102, %r19, 15;
-	add.s32 	%r20, %r15, %r102;
-	mov.u32 	%r183, 0;
-
-BB0_3:
-	mov.u32 	%r22, %r184;
-	add.s32 	%r103, %r19, %r183;
-	and.b32  	%r104, %r103, 65520;
-	shl.b32 	%r105, %r104, 16;
-	ld.param.u32 	%r179, [complexVoltages_param_1];
-	add.s32 	%r106, %r179, %r105;
-	shl.b32 	%r107, %r10, 9;
-	add.s32 	%r108, %r106, %r107;
-	shl.b32 	%r109, %r20, 4;
-	add.s32 	%r110, %r108, %r109;
-	shl.b32 	%r111, %r22, 4;
-	add.s32 	%r113, %r97, %r111;
-	ld.global.v4.f32 	{%f1024, %f1025, %f1026, %f1027}, [%r110];
-	st.shared.v4.f32 	[%r113], {%f1024, %f1025, %f1026, %f1027};
-	add.s32 	%r23, %r22, 256;
-	add.s32 	%r183, %r183, 256;
-	setp.gt.u32 	%p2, %r22, -257;
-	mov.u32 	%r184, %r23;
-	@%p2 bra 	BB0_3;
-
-BB0_4:
-	mov.u32 	%r186, %r12;
-	bar.sync 	0;
-	mov.u32 	%r185, 16;
-	mov.u32 	%r188, %r13;
-
-BB0_5:
-	mov.u32 	%r28, %r188;
-	ld.shared.v2.f32 	{%f798, %f799}, [%r186+-2048];
-	mov.f32 	%f1, 0f00000000;
-	fma.rn.ftz.f32 	%f802, %f1028, %f798, %f1;
-	fma.rn.ftz.f32 	%f803, %f1028, %f799, %f1;
-	neg.ftz.f32 	%f3, %f799;
-	fma.rn.ftz.f32 	%f810, %f1029, %f3, %f802;
-	fma.rn.ftz.f32 	%f811, %f1029, %f798, %f803;
-	ld.shared.v2.f32 	{%f814, %f815}, [%r186+-1792];
-	fma.rn.ftz.f32 	%f816, %f1034, %f814, %f810;
-	fma.rn.ftz.f32 	%f817, %f1034, %f815, %f811;
-	neg.ftz.f32 	%f6, %f815;
-	fma.rn.ftz.f32 	%f824, %f1035, %f6, %f816;
-	fma.rn.ftz.f32 	%f825, %f1035, %f814, %f817;
-	ld.shared.v2.f32 	{%f828, %f829}, [%r186+-1536];
-	fma.rn.ftz.f32 	%f830, %f1040, %f828, %f824;
-	fma.rn.ftz.f32 	%f831, %f1040, %f829, %f825;
-	neg.ftz.f32 	%f9, %f829;
-	fma.rn.ftz.f32 	%f838, %f1041, %f9, %f830;
-	fma.rn.ftz.f32 	%f839, %f1041, %f828, %f831;
-	ld.shared.v2.f32 	{%f842, %f843}, [%r186+-1280];
-	fma.rn.ftz.f32 	%f844, %f1046, %f842, %f838;
-	fma.rn.ftz.f32 	%f845, %f1046, %f843, %f839;
-	neg.ftz.f32 	%f12, %f843;
-	fma.rn.ftz.f32 	%f852, %f1047, %f12, %f844;
-	fma.rn.ftz.f32 	%f853, %f1047, %f842, %f845;
-	ld.shared.v2.f32 	{%f856, %f857}, [%r186+-1024];
-	fma.rn.ftz.f32 	%f858, %f1052, %f856, %f852;
-	fma.rn.ftz.f32 	%f859, %f1052, %f857, %f853;
-	neg.ftz.f32 	%f15, %f857;
-	fma.rn.ftz.f32 	%f866, %f1053, %f15, %f858;
-	fma.rn.ftz.f32 	%f867, %f1053, %f856, %f859;
-	ld.shared.v2.f32 	{%f870, %f871}, [%r186+-768];
-	fma.rn.ftz.f32 	%f872, %f1058, %f870, %f866;
-	fma.rn.ftz.f32 	%f873, %f1058, %f871, %f867;
-	neg.ftz.f32 	%f18, %f871;
-	fma.rn.ftz.f32 	%f880, %f1059, %f18, %f872;
-	fma.rn.ftz.f32 	%f881, %f1059, %f870, %f873;
-	ld.shared.v2.f32 	{%f884, %f885}, [%r186+-512];
-	fma.rn.ftz.f32 	%f886, %f1064, %f884, %f880;
-	fma.rn.ftz.f32 	%f887, %f1064, %f885, %f881;
-	neg.ftz.f32 	%f21, %f885;
-	fma.rn.ftz.f32 	%f894, %f1065, %f21, %f886;
-	fma.rn.ftz.f32 	%f895, %f1065, %f884, %f887;
-	ld.shared.v2.f32 	{%f898, %f899}, [%r186+-256];
-	fma.rn.ftz.f32 	%f900, %f1070, %f898, %f894;
-	fma.rn.ftz.f32 	%f901, %f1070, %f899, %f895;
-	neg.ftz.f32 	%f24, %f899;
-	fma.rn.ftz.f32 	%f908, %f1071, %f24, %f900;
-	fma.rn.ftz.f32 	%f909, %f1071, %f898, %f901;
-	ld.shared.v2.f32 	{%f912, %f913}, [%r186];
-	fma.rn.ftz.f32 	%f914, %f1076, %f912, %f908;
-	fma.rn.ftz.f32 	%f915, %f1076, %f913, %f909;
-	neg.ftz.f32 	%f27, %f913;
-	fma.rn.ftz.f32 	%f922, %f1077, %f27, %f914;
-	fma.rn.ftz.f32 	%f923, %f1077, %f912, %f915;
-	ld.shared.v2.f32 	{%f926, %f927}, [%r186+256];
-	fma.rn.ftz.f32 	%f928, %f1082, %f926, %f922;
-	fma.rn.ftz.f32 	%f929, %f1082, %f927, %f923;
-	neg.ftz.f32 	%f30, %f927;
-	fma.rn.ftz.f32 	%f936, %f1083, %f30, %f928;
-	fma.rn.ftz.f32 	%f937, %f1083, %f926, %f929;
-	ld.shared.v2.f32 	{%f940, %f941}, [%r186+512];
-	fma.rn.ftz.f32 	%f942, %f1088, %f940, %f936;
-	fma.rn.ftz.f32 	%f943, %f1088, %f941, %f937;
-	neg.ftz.f32 	%f33, %f941;
-	fma.rn.ftz.f32 	%f950, %f1089, %f33, %f942;
-	fma.rn.ftz.f32 	%f951, %f1089, %f940, %f943;
-	ld.shared.v2.f32 	{%f954, %f955}, [%r186+768];
-	fma.rn.ftz.f32 	%f956, %f1094, %f954, %f950;
-	fma.rn.ftz.f32 	%f957, %f1094, %f955, %f951;
-	neg.ftz.f32 	%f36, %f955;
-	fma.rn.ftz.f32 	%f964, %f1095, %f36, %f956;
-	fma.rn.ftz.f32 	%f965, %f1095, %f954, %f957;
-	ld.shared.v2.f32 	{%f968, %f969}, [%r186+1024];
-	fma.rn.ftz.f32 	%f970, %f1100, %f968, %f964;
-	fma.rn.ftz.f32 	%f971, %f1100, %f969, %f965;
-	neg.ftz.f32 	%f39, %f969;
-	fma.rn.ftz.f32 	%f978, %f1101, %f39, %f970;
-	fma.rn.ftz.f32 	%f979, %f1101, %f968, %f971;
-	ld.shared.v2.f32 	{%f982, %f983}, [%r186+1280];
-	fma.rn.ftz.f32 	%f984, %f1106, %f982, %f978;
-	fma.rn.ftz.f32 	%f985, %f1106, %f983, %f979;
-	neg.ftz.f32 	%f42, %f983;
-	fma.rn.ftz.f32 	%f992, %f1107, %f42, %f984;
-	fma.rn.ftz.f32 	%f993, %f1107, %f982, %f985;
-	ld.shared.v2.f32 	{%f996, %f997}, [%r186+1536];
-	fma.rn.ftz.f32 	%f998, %f1112, %f996, %f992;
-	fma.rn.ftz.f32 	%f999, %f1112, %f997, %f993;
-	neg.ftz.f32 	%f45, %f997;
-	fma.rn.ftz.f32 	%f1006, %f1113, %f45, %f998;
-	fma.rn.ftz.f32 	%f1007, %f1113, %f996, %f999;
-	ld.shared.v2.f32 	{%f1010, %f1011}, [%r186+1792];
-	fma.rn.ftz.f32 	%f1012, %f1118, %f1010, %f1006;
-	fma.rn.ftz.f32 	%f1013, %f1118, %f1011, %f1007;
-	neg.ftz.f32 	%f48, %f1011;
-	fma.rn.ftz.f32 	%f1020, %f1119, %f48, %f1012;
-	fma.rn.ftz.f32 	%f1021, %f1119, %f1010, %f1013;
-	st.global.v2.f32 	[%r28], {%f1020, %f1021};
-	add.s32 	%r29, %r28, 2048;
-	add.s32 	%r186, %r186, 16;
-	add.s32 	%r185, %r185, -1;
-	setp.ne.s32 	%p3, %r185, 0;
-	mov.u32 	%r188, %r29;
-	@%p3 bra 	BB0_5;
-
-	bar.sync 	0;
-	add.s32 	%r182, %r182, 1;
-	add.s32 	%r33, %r13, 32768;
-	setp.ne.s32 	%p4, %r182, 2;
-	mov.u32 	%r189, %r33;
-	@%p4 bra 	BB0_1;
-
-	ld.param.u32 	%r181, [complexVoltages_param_2];
-	add.s32 	%r119, %r181, %r90;
-	ld.global.v2.f32 	{%f702, %f703}, [%r119+33554432];
-	ld.global.v2.f32 	{%f708, %f709}, [%r119+35651584];
-	ld.global.v2.f32 	{%f714, %f715}, [%r119+37748736];
-	ld.global.v2.f32 	{%f720, %f721}, [%r119+39845888];
-	ld.global.v2.f32 	{%f726, %f727}, [%r119+41943040];
-	ld.global.v2.f32 	{%f732, %f733}, [%r119+44040192];
-	ld.global.v2.f32 	{%f738, %f739}, [%r119+46137344];
-	ld.global.v2.f32 	{%f744, %f745}, [%r119+48234496];
-	ld.global.v2.f32 	{%f750, %f751}, [%r119+50331648];
-	ld.global.v2.f32 	{%f756, %f757}, [%r119+52428800];
-	ld.global.v2.f32 	{%f762, %f763}, [%r119+54525952];
-	ld.global.v2.f32 	{%f768, %f769}, [%r119+56623104];
-	ld.global.v2.f32 	{%f774, %f775}, [%r119+58720256];
-	ld.global.v2.f32 	{%f780, %f781}, [%r119+60817408];
-	ld.global.v2.f32 	{%f786, %f787}, [%r119+62914560];
-	ld.global.v2.f32 	{%f792, %f793}, [%r119+65011712];
-	ld.param.u32 	%r176, [complexVoltages_param_0];
-	add.s32 	%r197, %r176, %r96;
-	mov.u32 	%r190, 0;
-
-BB0_8:
-	mov.u32 	%r195, %r197;
-	mov.u32 	%r36, %r195;
-	shl.b32 	%r38, %r190, 4;
-	// inline asm
-	mov.u32 	%r129, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r130, %tid.y;
-	// inline asm
-	shl.b32 	%r40, %r130, 1;
-	add.s32 	%r192, %r40, %r129;
-	setp.gt.u32 	%p5, %r192, 255;
-	@%p5 bra 	BB0_11;
-
-	add.s32 	%r42, %r129, %r40;
-	and.b32  	%r132, %r42, 15;
-	add.s32 	%r43, %r38, %r132;
-	mov.u32 	%r191, 0;
-
-BB0_10:
-	mov.u32 	%r45, %r192;
-	add.s32 	%r133, %r42, %r191;
-	shl.b32 	%r134, %r43, 4;
-	shl.b32 	%r135, %r10, 9;
-	shl.b32 	%r136, %r45, 4;
-	add.s32 	%r138, %r97, %r136;
-	and.b32  	%r139, %r133, 65520;
-	shl.b32 	%r140, %r139, 16;
-	ld.param.u32 	%r178, [complexVoltages_param_1];
-	add.s32 	%r141, %r140, %r178;
-	add.s32 	%r142, %r141, %r135;
-	add.s32 	%r143, %r142, %r134;
-	ld.global.v4.f32 	{%f698, %f699, %f700, %f701}, [%r143+16777216];
-	st.shared.v4.f32 	[%r138], {%f698, %f699, %f700, %f701};
-	add.s32 	%r46, %r45, 256;
-	add.s32 	%r191, %r191, 256;
-	setp.gt.u32 	%p6, %r45, -257;
-	mov.u32 	%r192, %r46;
-	@%p6 bra 	BB0_10;
-
-BB0_11:
-	mov.u32 	%r194, %r12;
-	bar.sync 	0;
-	mov.u32 	%r193, 16;
-	mov.u32 	%r196, %r36;
-
-BB0_12:
-	mov.u32 	%r51, %r196;
-	ld.shared.v2.f32 	{%f472, %f473}, [%r194+-2048];
-	ld.global.v2.f32 	{%f474, %f475}, [%r51];
-	fma.rn.ftz.f32 	%f476, %f702, %f472, %f474;
-	fma.rn.ftz.f32 	%f477, %f702, %f473, %f475;
-	neg.ftz.f32 	%f51, %f473;
-	fma.rn.ftz.f32 	%f484, %f703, %f51, %f476;
-	fma.rn.ftz.f32 	%f485, %f703, %f472, %f477;
-	ld.shared.v2.f32 	{%f488, %f489}, [%r194+-1792];
-	fma.rn.ftz.f32 	%f490, %f708, %f488, %f484;
-	fma.rn.ftz.f32 	%f491, %f708, %f489, %f485;
-	neg.ftz.f32 	%f54, %f489;
-	fma.rn.ftz.f32 	%f498, %f709, %f54, %f490;
-	fma.rn.ftz.f32 	%f499, %f709, %f488, %f491;
-	ld.shared.v2.f32 	{%f502, %f503}, [%r194+-1536];
-	fma.rn.ftz.f32 	%f504, %f714, %f502, %f498;
-	fma.rn.ftz.f32 	%f505, %f714, %f503, %f499;
-	neg.ftz.f32 	%f57, %f503;
-	fma.rn.ftz.f32 	%f512, %f715, %f57, %f504;
-	fma.rn.ftz.f32 	%f513, %f715, %f502, %f505;
-	ld.shared.v2.f32 	{%f516, %f517}, [%r194+-1280];
-	fma.rn.ftz.f32 	%f518, %f720, %f516, %f512;
-	fma.rn.ftz.f32 	%f519, %f720, %f517, %f513;
-	neg.ftz.f32 	%f60, %f517;
-	fma.rn.ftz.f32 	%f526, %f721, %f60, %f518;
-	fma.rn.ftz.f32 	%f527, %f721, %f516, %f519;
-	ld.shared.v2.f32 	{%f530, %f531}, [%r194+-1024];
-	fma.rn.ftz.f32 	%f532, %f726, %f530, %f526;
-	fma.rn.ftz.f32 	%f533, %f726, %f531, %f527;
-	neg.ftz.f32 	%f63, %f531;
-	fma.rn.ftz.f32 	%f540, %f727, %f63, %f532;
-	fma.rn.ftz.f32 	%f541, %f727, %f530, %f533;
-	ld.shared.v2.f32 	{%f544, %f545}, [%r194+-768];
-	fma.rn.ftz.f32 	%f546, %f732, %f544, %f540;
-	fma.rn.ftz.f32 	%f547, %f732, %f545, %f541;
-	neg.ftz.f32 	%f66, %f545;
-	fma.rn.ftz.f32 	%f554, %f733, %f66, %f546;
-	fma.rn.ftz.f32 	%f555, %f733, %f544, %f547;
-	ld.shared.v2.f32 	{%f558, %f559}, [%r194+-512];
-	fma.rn.ftz.f32 	%f560, %f738, %f558, %f554;
-	fma.rn.ftz.f32 	%f561, %f738, %f559, %f555;
-	neg.ftz.f32 	%f69, %f559;
-	fma.rn.ftz.f32 	%f568, %f739, %f69, %f560;
-	fma.rn.ftz.f32 	%f569, %f739, %f558, %f561;
-	ld.shared.v2.f32 	{%f572, %f573}, [%r194+-256];
-	fma.rn.ftz.f32 	%f574, %f744, %f572, %f568;
-	fma.rn.ftz.f32 	%f575, %f744, %f573, %f569;
-	neg.ftz.f32 	%f72, %f573;
-	fma.rn.ftz.f32 	%f582, %f745, %f72, %f574;
-	fma.rn.ftz.f32 	%f583, %f745, %f572, %f575;
-	ld.shared.v2.f32 	{%f586, %f587}, [%r194];
-	fma.rn.ftz.f32 	%f588, %f750, %f586, %f582;
-	fma.rn.ftz.f32 	%f589, %f750, %f587, %f583;
-	neg.ftz.f32 	%f75, %f587;
-	fma.rn.ftz.f32 	%f596, %f751, %f75, %f588;
-	fma.rn.ftz.f32 	%f597, %f751, %f586, %f589;
-	ld.shared.v2.f32 	{%f600, %f601}, [%r194+256];
-	fma.rn.ftz.f32 	%f602, %f756, %f600, %f596;
-	fma.rn.ftz.f32 	%f603, %f756, %f601, %f597;
-	neg.ftz.f32 	%f78, %f601;
-	fma.rn.ftz.f32 	%f610, %f757, %f78, %f602;
-	fma.rn.ftz.f32 	%f611, %f757, %f600, %f603;
-	ld.shared.v2.f32 	{%f614, %f615}, [%r194+512];
-	fma.rn.ftz.f32 	%f616, %f762, %f614, %f610;
-	fma.rn.ftz.f32 	%f617, %f762, %f615, %f611;
-	neg.ftz.f32 	%f81, %f615;
-	fma.rn.ftz.f32 	%f624, %f763, %f81, %f616;
-	fma.rn.ftz.f32 	%f625, %f763, %f614, %f617;
-	ld.shared.v2.f32 	{%f628, %f629}, [%r194+768];
-	fma.rn.ftz.f32 	%f630, %f768, %f628, %f624;
-	fma.rn.ftz.f32 	%f631, %f768, %f629, %f625;
-	neg.ftz.f32 	%f84, %f629;
-	fma.rn.ftz.f32 	%f638, %f769, %f84, %f630;
-	fma.rn.ftz.f32 	%f639, %f769, %f628, %f631;
-	ld.shared.v2.f32 	{%f642, %f643}, [%r194+1024];
-	fma.rn.ftz.f32 	%f644, %f774, %f642, %f638;
-	fma.rn.ftz.f32 	%f645, %f774, %f643, %f639;
-	neg.ftz.f32 	%f87, %f643;
-	fma.rn.ftz.f32 	%f652, %f775, %f87, %f644;
-	fma.rn.ftz.f32 	%f653, %f775, %f642, %f645;
-	ld.shared.v2.f32 	{%f656, %f657}, [%r194+1280];
-	fma.rn.ftz.f32 	%f658, %f780, %f656, %f652;
-	fma.rn.ftz.f32 	%f659, %f780, %f657, %f653;
-	neg.ftz.f32 	%f90, %f657;
-	fma.rn.ftz.f32 	%f666, %f781, %f90, %f658;
-	fma.rn.ftz.f32 	%f667, %f781, %f656, %f659;
-	ld.shared.v2.f32 	{%f670, %f671}, [%r194+1536];
-	fma.rn.ftz.f32 	%f672, %f786, %f670, %f666;
-	fma.rn.ftz.f32 	%f673, %f786, %f671, %f667;
-	neg.ftz.f32 	%f93, %f671;
-	fma.rn.ftz.f32 	%f680, %f787, %f93, %f672;
-	fma.rn.ftz.f32 	%f681, %f787, %f670, %f673;
-	ld.shared.v2.f32 	{%f684, %f685}, [%r194+1792];
-	fma.rn.ftz.f32 	%f686, %f792, %f684, %f680;
-	fma.rn.ftz.f32 	%f687, %f792, %f685, %f681;
-	neg.ftz.f32 	%f96, %f685;
-	fma.rn.ftz.f32 	%f694, %f793, %f96, %f686;
-	fma.rn.ftz.f32 	%f695, %f793, %f684, %f687;
-	st.global.v2.f32 	[%r51], {%f694, %f695};
-	add.s32 	%r52, %r51, 2048;
-	add.s32 	%r194, %r194, 16;
-	add.s32 	%r193, %r193, -1;
-	setp.ne.s32 	%p7, %r193, 0;
-	mov.u32 	%r196, %r52;
-	@%p7 bra 	BB0_12;
-
-	bar.sync 	0;
-	add.s32 	%r190, %r190, 1;
-	add.s32 	%r56, %r36, 32768;
-	setp.ne.s32 	%p8, %r190, 2;
-	mov.u32 	%r197, %r56;
-	@%p8 bra 	BB0_8;
-
-	ld.param.u32 	%r180, [complexVoltages_param_2];
-	add.s32 	%r149, %r180, %r90;
-	ld.global.v2.f32 	{%f376, %f377}, [%r149+67108864];
-	ld.global.v2.f32 	{%f382, %f383}, [%r149+69206016];
-	ld.global.v2.f32 	{%f388, %f389}, [%r149+71303168];
-	ld.global.v2.f32 	{%f394, %f395}, [%r149+73400320];
-	ld.global.v2.f32 	{%f400, %f401}, [%r149+75497472];
-	ld.global.v2.f32 	{%f406, %f407}, [%r149+77594624];
-	ld.global.v2.f32 	{%f412, %f413}, [%r149+79691776];
-	ld.global.v2.f32 	{%f418, %f419}, [%r149+81788928];
-	ld.global.v2.f32 	{%f424, %f425}, [%r149+83886080];
-	ld.global.v2.f32 	{%f430, %f431}, [%r149+85983232];
-	ld.global.v2.f32 	{%f436, %f437}, [%r149+88080384];
-	ld.global.v2.f32 	{%f442, %f443}, [%r149+90177536];
-	ld.global.v2.f32 	{%f448, %f449}, [%r149+92274688];
-	ld.global.v2.f32 	{%f454, %f455}, [%r149+94371840];
-	ld.global.v2.f32 	{%f460, %f461}, [%r149+96468992];
-	ld.global.v2.f32 	{%f466, %f467}, [%r149+98566144];
-	ld.param.u32 	%r175, [complexVoltages_param_0];
-	add.s32 	%r205, %r175, %r96;
-	mov.u32 	%r198, 0;
-
-BB0_15:
-	mov.u32 	%r203, %r205;
-	mov.u32 	%r59, %r203;
-	shl.b32 	%r61, %r198, 4;
-	// inline asm
-	mov.u32 	%r159, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r160, %tid.y;
-	// inline asm
-	shl.b32 	%r63, %r160, 1;
-	add.s32 	%r200, %r63, %r159;
-	setp.gt.u32 	%p9, %r200, 255;
-	@%p9 bra 	BB0_18;
-
-	add.s32 	%r65, %r159, %r63;
-	and.b32  	%r162, %r65, 15;
-	add.s32 	%r66, %r61, %r162;
-	mov.u32 	%r199, 0;
-
-BB0_17:
-	mov.u32 	%r68, %r200;
-	add.s32 	%r163, %r65, %r199;
-	shl.b32 	%r164, %r66, 4;
-	shl.b32 	%r165, %r10, 9;
-	shl.b32 	%r166, %r68, 4;
-	add.s32 	%r168, %r97, %r166;
-	and.b32  	%r169, %r163, 65520;
-	shl.b32 	%r170, %r169, 16;
-	ld.param.u32 	%r177, [complexVoltages_param_1];
-	add.s32 	%r171, %r170, %r177;
-	add.s32 	%r172, %r171, %r165;
-	add.s32 	%r173, %r172, %r164;
-	ld.global.v4.f32 	{%f372, %f373, %f374, %f375}, [%r173+33554432];
-	st.shared.v4.f32 	[%r168], {%f372, %f373, %f374, %f375};
-	add.s32 	%r69, %r68, 256;
-	add.s32 	%r199, %r199, 256;
-	setp.gt.u32 	%p10, %r68, -257;
-	mov.u32 	%r200, %r69;
-	@%p10 bra 	BB0_17;
-
-BB0_18:
-	mov.u32 	%r202, %r12;
-	bar.sync 	0;
-	mov.u32 	%r201, 16;
-	mov.u32 	%r204, %r59;
-
-BB0_19:
-	mov.u32 	%r74, %r204;
-	ld.shared.v2.f32 	{%f146, %f147}, [%r202+-2048];
-	ld.global.v2.f32 	{%f148, %f149}, [%r74];
-	fma.rn.ftz.f32 	%f150, %f376, %f146, %f148;
-	fma.rn.ftz.f32 	%f151, %f376, %f147, %f149;
-	neg.ftz.f32 	%f99, %f147;
-	fma.rn.ftz.f32 	%f158, %f377, %f99, %f150;
-	fma.rn.ftz.f32 	%f159, %f377, %f146, %f151;
-	ld.shared.v2.f32 	{%f162, %f163}, [%r202+-1792];
-	fma.rn.ftz.f32 	%f164, %f382, %f162, %f158;
-	fma.rn.ftz.f32 	%f165, %f382, %f163, %f159;
-	neg.ftz.f32 	%f102, %f163;
-	fma.rn.ftz.f32 	%f172, %f383, %f102, %f164;
-	fma.rn.ftz.f32 	%f173, %f383, %f162, %f165;
-	ld.shared.v2.f32 	{%f176, %f177}, [%r202+-1536];
-	fma.rn.ftz.f32 	%f178, %f388, %f176, %f172;
-	fma.rn.ftz.f32 	%f179, %f388, %f177, %f173;
-	neg.ftz.f32 	%f105, %f177;
-	fma.rn.ftz.f32 	%f186, %f389, %f105, %f178;
-	fma.rn.ftz.f32 	%f187, %f389, %f176, %f179;
-	ld.shared.v2.f32 	{%f190, %f191}, [%r202+-1280];
-	fma.rn.ftz.f32 	%f192, %f394, %f190, %f186;
-	fma.rn.ftz.f32 	%f193, %f394, %f191, %f187;
-	neg.ftz.f32 	%f108, %f191;
-	fma.rn.ftz.f32 	%f200, %f395, %f108, %f192;
-	fma.rn.ftz.f32 	%f201, %f395, %f190, %f193;
-	ld.shared.v2.f32 	{%f204, %f205}, [%r202+-1024];
-	fma.rn.ftz.f32 	%f206, %f400, %f204, %f200;
-	fma.rn.ftz.f32 	%f207, %f400, %f205, %f201;
-	neg.ftz.f32 	%f111, %f205;
-	fma.rn.ftz.f32 	%f214, %f401, %f111, %f206;
-	fma.rn.ftz.f32 	%f215, %f401, %f204, %f207;
-	ld.shared.v2.f32 	{%f218, %f219}, [%r202+-768];
-	fma.rn.ftz.f32 	%f220, %f406, %f218, %f214;
-	fma.rn.ftz.f32 	%f221, %f406, %f219, %f215;
-	neg.ftz.f32 	%f114, %f219;
-	fma.rn.ftz.f32 	%f228, %f407, %f114, %f220;
-	fma.rn.ftz.f32 	%f229, %f407, %f218, %f221;
-	ld.shared.v2.f32 	{%f232, %f233}, [%r202+-512];
-	fma.rn.ftz.f32 	%f234, %f412, %f232, %f228;
-	fma.rn.ftz.f32 	%f235, %f412, %f233, %f229;
-	neg.ftz.f32 	%f117, %f233;
-	fma.rn.ftz.f32 	%f242, %f413, %f117, %f234;
-	fma.rn.ftz.f32 	%f243, %f413, %f232, %f235;
-	ld.shared.v2.f32 	{%f246, %f247}, [%r202+-256];
-	fma.rn.ftz.f32 	%f248, %f418, %f246, %f242;
-	fma.rn.ftz.f32 	%f249, %f418, %f247, %f243;
-	neg.ftz.f32 	%f120, %f247;
-	fma.rn.ftz.f32 	%f256, %f419, %f120, %f248;
-	fma.rn.ftz.f32 	%f257, %f419, %f246, %f249;
-	ld.shared.v2.f32 	{%f260, %f261}, [%r202];
-	fma.rn.ftz.f32 	%f262, %f424, %f260, %f256;
-	fma.rn.ftz.f32 	%f263, %f424, %f261, %f257;
-	neg.ftz.f32 	%f123, %f261;
-	fma.rn.ftz.f32 	%f270, %f425, %f123, %f262;
-	fma.rn.ftz.f32 	%f271, %f425, %f260, %f263;
-	ld.shared.v2.f32 	{%f274, %f275}, [%r202+256];
-	fma.rn.ftz.f32 	%f276, %f430, %f274, %f270;
-	fma.rn.ftz.f32 	%f277, %f430, %f275, %f271;
-	neg.ftz.f32 	%f126, %f275;
-	fma.rn.ftz.f32 	%f284, %f431, %f126, %f276;
-	fma.rn.ftz.f32 	%f285, %f431, %f274, %f277;
-	ld.shared.v2.f32 	{%f288, %f289}, [%r202+512];
-	fma.rn.ftz.f32 	%f290, %f436, %f288, %f284;
-	fma.rn.ftz.f32 	%f291, %f436, %f289, %f285;
-	neg.ftz.f32 	%f129, %f289;
-	fma.rn.ftz.f32 	%f298, %f437, %f129, %f290;
-	fma.rn.ftz.f32 	%f299, %f437, %f288, %f291;
-	ld.shared.v2.f32 	{%f302, %f303}, [%r202+768];
-	fma.rn.ftz.f32 	%f304, %f442, %f302, %f298;
-	fma.rn.ftz.f32 	%f305, %f442, %f303, %f299;
-	neg.ftz.f32 	%f132, %f303;
-	fma.rn.ftz.f32 	%f312, %f443, %f132, %f304;
-	fma.rn.ftz.f32 	%f313, %f443, %f302, %f305;
-	ld.shared.v2.f32 	{%f316, %f317}, [%r202+1024];
-	fma.rn.ftz.f32 	%f318, %f448, %f316, %f312;
-	fma.rn.ftz.f32 	%f319, %f448, %f317, %f313;
-	neg.ftz.f32 	%f135, %f317;
-	fma.rn.ftz.f32 	%f326, %f449, %f135, %f318;
-	fma.rn.ftz.f32 	%f327, %f449, %f316, %f319;
-	ld.shared.v2.f32 	{%f330, %f331}, [%r202+1280];
-	fma.rn.ftz.f32 	%f332, %f454, %f330, %f326;
-	fma.rn.ftz.f32 	%f333, %f454, %f331, %f327;
-	neg.ftz.f32 	%f138, %f331;
-	fma.rn.ftz.f32 	%f340, %f455, %f138, %f332;
-	fma.rn.ftz.f32 	%f341, %f455, %f330, %f333;
-	ld.shared.v2.f32 	{%f344, %f345}, [%r202+1536];
-	fma.rn.ftz.f32 	%f346, %f460, %f344, %f340;
-	fma.rn.ftz.f32 	%f347, %f460, %f345, %f341;
-	neg.ftz.f32 	%f141, %f345;
-	fma.rn.ftz.f32 	%f354, %f461, %f141, %f346;
-	fma.rn.ftz.f32 	%f355, %f461, %f344, %f347;
-	ld.shared.v2.f32 	{%f358, %f359}, [%r202+1792];
-	fma.rn.ftz.f32 	%f360, %f466, %f358, %f354;
-	fma.rn.ftz.f32 	%f361, %f466, %f359, %f355;
-	neg.ftz.f32 	%f144, %f359;
-	fma.rn.ftz.f32 	%f368, %f467, %f144, %f360;
-	fma.rn.ftz.f32 	%f369, %f467, %f358, %f361;
-	st.global.v2.f32 	[%r74], {%f368, %f369};
-	add.s32 	%r75, %r74, 2048;
-	add.s32 	%r202, %r202, 16;
-	add.s32 	%r201, %r201, -1;
-	setp.ne.s32 	%p11, %r201, 0;
-	mov.u32 	%r204, %r75;
-	@%p11 bra 	BB0_19;
-
-	bar.sync 	0;
-	add.s32 	%r198, %r198, 1;
-	add.s32 	%r79, %r59, 32768;
-	setp.ne.s32 	%p12, %r198, 2;
-	mov.u32 	%r205, %r79;
-	@%p12 bra 	BB0_15;
-
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl-0.ptx
deleted file mode 100644
index 95682a21f13f73a46e51290282c947066bcf9e94..0000000000000000000000000000000000000000
Binary files a/RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl-0.ptx and /dev/null differ
diff --git a/RTCP/GPUProc/src/BeamFormer/Dedispersion.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/Dedispersion.cl-0.ptx
deleted file mode 100644
index 221a542fb5fdfc1737730694aaf0921d00774dfb..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/BeamFormer/Dedispersion.cl-0.ptx
+++ /dev/null
@@ -1,417 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Sat Sep 29 10:12:13 2012 (1348906333)
-// Driver 304.54
-//
-
-.version 3.0
-.target sm_30, texmode_independent
-.address_size 32
-
-.extern .shared .align 4 .b8 shr_2_local_DMs[512];
-
-.entry applyChirp(
-	.param .u32 .ptr .global .align 1 applyChirp_param_0,
-	.param .u32 .ptr .global .align 4 applyChirp_param_1,
-	.param .f32 applyChirp_param_2
-)
-{
-	.reg .f32 	%f<225>;
-	.reg .pred 	%p<26>;
-	.reg .s32 	%r<97>;
-
-
-	// inline asm
-	mov.u32 	%r32, %tid.x;
-	// inline asm
-	setp.gt.s32 	%p4, %r32, 127;
-	mov.u32 	%r92, %r32;
-	@%p4 bra 	BB0_2;
-
-BB0_1:
-	shl.b32 	%r34, %r92, 2;
-	ld.param.u32 	%r91, [applyChirp_param_1];
-	add.s32 	%r35, %r91, %r34;
-	ld.global.f32 	%f29, [%r35];
-	add.ftz.f32 	%f30, %f29, %f29;
-	mul.ftz.f32 	%f31, %f30, 0f40490FDB;
-	mul.ftz.f32 	%f32, %f31, 0f596BD7E5;
-	mov.u32 	%r36, shr_2_local_DMs;
-	add.s32 	%r37, %r36, %r34;
-	st.shared.f32 	[%r37], %f32;
-	// inline asm
-	mov.u32 	%r33, %ntid.x;
-	// inline asm
-	add.s32 	%r92, %r33, %r92;
-	setp.lt.s32 	%p5, %r92, 128;
-	@%p5 bra 	BB0_1;
-
-BB0_2:
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r38, %envreg3;
-	// inline asm
-	// inline asm
-	mov.u32 	%r39, %ntid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r40, %ctaid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r41, %tid.x;
-	// inline asm
-	add.s32 	%r50, %r41, %r38;
-	mad.lo.s32 	%r51, %r40, %r39, %r50;
-	// inline asm
-	mov.u32 	%r42, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r43, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r44, %ctaid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r45, %tid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r46, %envreg5;
-	// inline asm
-	// inline asm
-	mov.u32 	%r47, %ntid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r48, %ctaid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r49, %tid.z;
-	// inline asm
-	add.s32 	%r52, %r49, %r46;
-	mad.lo.s32 	%r53, %r48, %r47, %r52;
-	cvt.rn.f32.u32 	%f35, %r53;
-	ld.param.f32 	%f217, [applyChirp_param_2];
-	add.ftz.f32 	%f36, %f217, 0fC7BEBC20;
-	fma.rn.ftz.f32 	%f2, %f35, 0f42BEBC20, %f36;
-	cvt.rn.f32.u32 	%f37, %r51;
-	mul.ftz.f32 	%f38, %f37, 0f403EBC20;
-	setp.gt.u32 	%p6, %r51, 32;
-	fma.rn.ftz.f32 	%f39, %f37, 0f403EBC20, 0fC2BEBC20;
-	selp.f32 	%f3, %f39, %f38, %p6;
-	div.rn.ftz.f32 	%f34, %f3, 0f42334A70;
-	// inline asm
-	abs.f32 	%f33, %f34;
-	// inline asm
-	setp.eq.ftz.f32 	%p7, %f34, 0f3F800000;
-	@%p7 bra 	BB0_25;
-
-	setp.nan.ftz.f32 	%p8, %f34, %f34;
-	@%p8 bra 	BB0_24;
-
-	mov.f32 	%f6, 0fFF800000;
-	mov.f32 	%f220, 0f42A00000;
-	mov.f32 	%f44, 0f3F000000;
-	mul.rn.f32 	%f41, %f44, %f220;
-	// inline asm
-	cvt.rmi.f32.f32 	%f40, %f41;
-	// inline asm
-	mov.f32 	%f45, 0f40000000;
-	mul.rn.f32 	%f46, %f45, %f40;
-	sub.ftz.f32 	%f47, %f220, %f46;
-	setp.eq.ftz.f32 	%p1, %f47, 0f3F800000;
-	// inline asm
-	cvt.rzi.f32.f32 	%f42, %f220;
-	// inline asm
-	setp.eq.ftz.f32 	%p2, %f42, 0f42A00000;
-	and.pred  	%p3, %p1, %p2;
-	setp.eq.ftz.f32 	%p9, %f33, 0f00000000;
-	@%p9 bra 	BB0_23;
-
-	setp.eq.ftz.f32 	%p10, %f34, 0f7F800000;
-	setp.eq.ftz.f32 	%p11, %f34, %f6;
-	or.pred  	%p12, %p10, %p11;
-	@%p12 bra 	BB0_20;
-
-	setp.geu.ftz.f32 	%p13, %f34, 0f00000000;
-	@%p13 bra 	BB0_8;
-
-	mov.f32 	%f49, 0f42A00000;
-	// inline asm
-	cvt.rzi.f32.f32 	%f48, %f49;
-	// inline asm
-	setp.neu.ftz.f32 	%p14, %f48, 0f42A00000;
-	@%p14 bra 	BB0_19;
-
-BB0_8:
-	// inline asm
-	abs.f32 	%f50, %f34;
-	// inline asm
-	mov.b32 	 %r18, %f50;
-	shr.u32 	%r54, %r18, 23;
-	and.b32  	%r55, %r54, 255;
-	add.s32 	%r93, %r55, -127;
-	setp.eq.s32 	%p15, %r55, 0;
-	mov.f32 	%f218, %f50;
-	@%p15 bra 	BB0_9;
-	bra.uni 	BB0_10;
-
-BB0_9:
-	and.b32  	%r56, %r18, -2139095041;
-	or.b32  	%r57, %r56, 1065353216;
-	mov.b32 	 %f52, %r57;
-	add.ftz.f32 	%f53, %f52, 0fBF800000;
-	mov.b32 	 %r58, %f53;
-	shr.u32 	%r59, %r58, 23;
-	and.b32  	%r60, %r59, 255;
-	add.s32 	%r93, %r60, -253;
-	and.b32  	%r61, %r58, -2139095041;
-	or.b32  	%r62, %r61, 1065353216;
-	mov.b32 	 %f218, %r62;
-
-BB0_10:
-	mov.b32 	 %r63, %f218;
-	and.b32  	%r64, %r63, -2139095041;
-	or.b32  	%r65, %r64, 1065353216;
-	mov.b32 	 %f219, %r65;
-	setp.gt.ftz.f32 	%p16, %f219, 0f3FB504F3;
-	@%p16 bra 	BB0_11;
-	bra.uni 	BB0_12;
-
-BB0_11:
-	mul.rn.f32 	%f219, %f219, %f44;
-	add.s32 	%r93, %r93, 1;
-
-BB0_12:
-	add.ftz.f32 	%f63, %f219, 0f3F800000;
-	rcp.approx.ftz.f32 	%f57, %f63;
-	add.ftz.f32 	%f56, %f219, 0fBF800000;
-	// inline asm
-	mul.rz.f32 	%f55, %f56, %f57;
-	// inline asm
-	mul.rn.f32 	%f65, %f45, %f55;
-	mul.rn.f32 	%f66, %f65, %f65;
-	mov.f32 	%f67, 0f3B18F0FE;
-	mul.rn.f32 	%f68, %f67, %f66;
-	add.ftz.f32 	%f69, %f68, 0f3C4CAF63;
-	mul.rn.f32 	%f70, %f69, %f66;
-	add.ftz.f32 	%f71, %f70, 0f3DAAAABD;
-	mul.rn.f32 	%f72, %f71, %f66;
-	mul.rn.f32 	%f60, %f72, %f65;
-	mov.b32 	 %r66, %f65;
-	and.b32  	%r67, %r66, -4096;
-	mov.b32 	 %f73, %r67;
-	mov.b32 	 %r68, %f56;
-	and.b32  	%r69, %r68, -4096;
-	mov.b32 	 %f74, %r69;
-	sub.ftz.f32 	%f75, %f56, %f73;
-	mul.rn.f32 	%f76, %f45, %f75;
-	sub.ftz.f32 	%f77, %f56, %f74;
-	mul.rn.f32 	%f78, %f73, %f74;
-	sub.ftz.f32 	%f79, %f76, %f78;
-	mul.rn.f32 	%f80, %f73, %f77;
-	sub.ftz.f32 	%f81, %f79, %f80;
-	mul.rn.f32 	%f82, %f57, %f81;
-	add.ftz.f32 	%f83, %f73, %f82;
-	sub.ftz.f32 	%f84, %f83, %f73;
-	sub.ftz.f32 	%f85, %f82, %f84;
-	add.ftz.f32 	%f86, %f83, %f60;
-	neg.ftz.f32 	%f59, %f60;
-	// inline asm
-	add.rz.f32 	%f58, %f59, %f60;
-	// inline asm
-	add.ftz.f32 	%f87, %f58, %f85;
-	add.ftz.f32 	%f88, %f86, %f87;
-	sub.ftz.f32 	%f89, %f87, %f87;
-	cvt.rn.f32.s32 	%f90, %r93;
-	mov.f32 	%f91, 0f3F317200;
-	mul.rn.f32 	%f92, %f90, %f91;
-	mov.f32 	%f93, 0f35BFBE8E;
-	mul.rn.f32 	%f94, %f90, %f93;
-	add.ftz.f32 	%f95, %f92, %f88;
-	sub.ftz.f32 	%f96, %f88, %f88;
-	add.ftz.f32 	%f97, %f96, %f89;
-	add.ftz.f32 	%f98, %f97, %f94;
-	add.ftz.f32 	%f13, %f95, %f98;
-	sub.ftz.f32 	%f14, %f98, %f98;
-	// inline asm
-	abs.f32 	%f61, %f220;
-	// inline asm
-	setp.gt.ftz.f32 	%p17, %f61, 0f77F684DF;
-	@%p17 bra 	BB0_13;
-	bra.uni 	BB0_14;
-
-BB0_13:
-	mov.f32 	%f100, 0f39000000;
-	mov.f32 	%f101, 0f42A00000;
-	mul.rn.f32 	%f220, %f101, %f100;
-
-BB0_14:
-	mov.f32 	%f102, 0f45800800;
-	mul.rn.f32 	%f103, %f13, %f102;
-	sub.ftz.f32 	%f104, %f13, %f103;
-	add.ftz.f32 	%f105, %f104, %f103;
-	sub.ftz.f32 	%f106, %f13, %f105;
-	mul.rn.f32 	%f107, %f220, %f102;
-	sub.ftz.f32 	%f108, %f220, %f107;
-	add.ftz.f32 	%f109, %f108, %f107;
-	sub.ftz.f32 	%f110, %f220, %f109;
-	mul.rn.f32 	%f111, %f105, %f109;
-	mul.rn.f32 	%f112, %f13, %f220;
-	sub.ftz.f32 	%f113, %f111, %f112;
-	mul.rn.f32 	%f114, %f105, %f110;
-	add.ftz.f32 	%f115, %f113, %f114;
-	mul.rn.f32 	%f116, %f106, %f109;
-	add.ftz.f32 	%f117, %f115, %f116;
-	mul.rn.f32 	%f118, %f106, %f110;
-	add.ftz.f32 	%f119, %f117, %f118;
-	mul.rn.f32 	%f120, %f14, %f220;
-	add.ftz.f32 	%f121, %f120, %f119;
-	add.ftz.f32 	%f122, %f112, %f121;
-	sub.ftz.f32 	%f17, %f121, %f121;
-	mov.f32 	%f223, %f17;
-	mov.f32 	%f224, %f122;
-	mov.b32 	 %r24, %f122;
-	setp.eq.s32 	%p18, %r24, 1118925336;
-	@%p18 bra 	BB0_15;
-	bra.uni 	BB0_16;
-
-BB0_15:
-	add.s32 	%r70, %r24, -1;
-	mov.b32 	 %f123, %r70;
-	add.ftz.f32 	%f124, %f17, 0f37000000;
-	mov.f32 	%f223, %f124;
-	mov.f32 	%f224, %f123;
-
-BB0_16:
-	// inline asm
-	mul.f32 	%f125, %f224, 0f3FB8AA3B;ex2.approx.f32 	%f125, %f125;
-	// inline asm
-	setp.neu.ftz.f32 	%p19, %f125, 0f7F800000;
-	mov.f32 	%f221, %f125;
-	@%p19 bra 	BB0_17;
-	bra.uni 	BB0_18;
-
-BB0_17:
-	// inline asm
-	mad.f32 	%f127, %f125, %f223, %f125;
-	// inline asm
-	mov.f32 	%f221, %f127;
-
-BB0_18:
-	not.pred 	%p21, %p3;
-	or.pred  	%p23, %p13, %p21;
-	mov.b32 	 %r71, %f221;
-	xor.b32  	%r72, %r71, -2147483648;
-	mov.b32 	 %f131, %r72;
-	selp.f32 	%f222, %f221, %f131, %p23;
-	bra.uni 	BB0_26;
-
-BB0_19:
-	mov.f32 	%f222, 0f7FFFFFFF;
-	bra.uni 	BB0_26;
-
-BB0_20:
-	mov.b32 	 %r73, %f34;
-	setp.lt.s32 	%p24, %r73, 0;
-	@%p24 bra 	BB0_22;
-
-	mov.f32 	%f222, 0f7F800000;
-	bra.uni 	BB0_26;
-
-BB0_22:
-	selp.f32 	%f222, 0fFF800000, 0f7F800000, %p3;
-	bra.uni 	BB0_26;
-
-BB0_23:
-	mov.b32 	 %r74, %f34;
-	and.b32  	%r75, %r74, -2147483648;
-	mov.b32 	 %f132, %r75;
-	selp.f32 	%f222, %f132, 0f00000000, %p3;
-	bra.uni 	BB0_26;
-
-BB0_24:
-	add.ftz.f32 	%f222, %f34, 0f42A00000;
-	bra.uni 	BB0_26;
-
-BB0_25:
-	mov.f32 	%f222, 0f3F800000;
-
-BB0_26:
-	add.ftz.f32 	%f135, %f222, 0f3F800000;
-	// inline asm
-	rsqrt.approx.f32 	%f134, %f135;
-	// inline asm
-	mul.ftz.f32 	%f136, %f134, 0f42000000;
-	div.approx.ftz.f32 	%f137, %f3, %f2;
-	mul.ftz.f32 	%f138, %f137, %f137;
-	add.ftz.f32 	%f139, %f2, %f3;
-	div.approx.ftz.f32 	%f28, %f138, %f139;
-	add.s32 	%r78, %r45, %r42;
-	mad.lo.s32 	%r79, %r44, %r43, %r78;
-	shl.b32 	%r80, %r79, 8;
-	shl.b32 	%r83, %r53, 8;
-	add.s32 	%r84, %r80, %r83;
-	shl.b32 	%r87, %r51, 3;
-	add.s32 	%r88, %r84, %r87;
-	ld.param.u32 	%r90, [applyChirp_param_0];
-	add.s32 	%r89, %r88, %r90;
-	add.s32 	%r95, %r89, 1048576;
-	mov.u32 	%r96, 128;
-	mov.u32 	%r94, shr_2_local_DMs;
-
-BB0_27:
-	ld.global.v2.f32 	{%f181, %f182}, [%r95+-1048576];
-	ld.global.v2.f32 	{%f183, %f184}, [%r95+-524288];
-	ld.shared.f32 	%f149, [%r94];
-	mul.ftz.f32 	%f144, %f149, %f28;
-	// inline asm
-	cos.approx.f32 	%f141, %f144;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f143, %f144;
-	// inline asm
-	mul.ftz.f32 	%f187, %f141, %f136;
-	mul.ftz.f32 	%f188, %f143, %f136;
-	mul.ftz.f32 	%f152, %f187, %f181;
-	neg.f32 	%f155, %f188;
-	fma.rn.ftz.f32 	%f156, %f155, %f182, %f152;
-	mul.ftz.f32 	%f157, %f188, %f181;
-	fma.rn.ftz.f32 	%f158, %f187, %f182, %f157;
-	st.global.v2.f32 	[%r95+-1048576], {%f156, %f158};
-	mul.ftz.f32 	%f160, %f187, %f183;
-	fma.rn.ftz.f32 	%f162, %f155, %f184, %f160;
-	mul.ftz.f32 	%f163, %f188, %f183;
-	fma.rn.ftz.f32 	%f164, %f187, %f184, %f163;
-	st.global.v2.f32 	[%r95+-524288], {%f162, %f164};
-	ld.global.v2.f32 	{%f195, %f196}, [%r95];
-	ld.global.v2.f32 	{%f197, %f198}, [%r95+524288];
-	ld.shared.f32 	%f165, [%r94+4];
-	mul.ftz.f32 	%f148, %f165, %f28;
-	// inline asm
-	cos.approx.f32 	%f145, %f148;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f147, %f148;
-	// inline asm
-	mul.ftz.f32 	%f201, %f145, %f136;
-	mul.ftz.f32 	%f202, %f147, %f136;
-	mul.ftz.f32 	%f168, %f201, %f195;
-	neg.f32 	%f171, %f202;
-	fma.rn.ftz.f32 	%f172, %f171, %f196, %f168;
-	mul.ftz.f32 	%f173, %f202, %f195;
-	fma.rn.ftz.f32 	%f174, %f201, %f196, %f173;
-	st.global.v2.f32 	[%r95], {%f172, %f174};
-	mul.ftz.f32 	%f176, %f201, %f197;
-	fma.rn.ftz.f32 	%f178, %f171, %f198, %f176;
-	mul.ftz.f32 	%f179, %f202, %f197;
-	fma.rn.ftz.f32 	%f180, %f201, %f198, %f179;
-	st.global.v2.f32 	[%r95+524288], {%f178, %f180};
-	add.s32 	%r95, %r95, 2097152;
-	add.s32 	%r94, %r94, 8;
-	add.s32 	%r96, %r96, -2;
-	setp.ne.s32 	%p25, %r96, 0;
-	@%p25 bra 	BB0_27;
-
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl-0.ptx
deleted file mode 100644
index be0613c10bbe025940d0054bfd9e8579149984e3..0000000000000000000000000000000000000000
Binary files a/RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl-0.ptx and /dev/null differ
diff --git a/RTCP/GPUProc/src/BeamFormer/IntToFloat.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/IntToFloat.cl-0.ptx
deleted file mode 100644
index ce5c60b9ac9b08ef14ba9402e30163b4bede74bb..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/BeamFormer/IntToFloat.cl-0.ptx
+++ /dev/null
@@ -1,82 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Sat Sep 29 10:12:13 2012 (1348906333)
-// Driver 304.54
-//
-
-.version 3.0
-.target sm_30, texmode_independent
-.address_size 32
-
-
-.entry intToFloat(
-	.param .u32 .ptr .global .align 1 intToFloat_param_0,
-	.param .u32 .ptr .global .align 1 intToFloat_param_1
-)
-{
-	.reg .f32 	%f<9>;
-	.reg .pred 	%p<3>;
-	.reg .s32 	%r<25>;
-	.reg .s16 	%rc<9>;
-
-
-	// inline asm
-	mov.u32 	%r7, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r8, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r9, %ctaid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r10, %tid.y;
-	// inline asm
-	add.s32 	%r12, %r10, %r7;
-	mad.lo.s32 	%r3, %r9, %r8, %r12;
-	// inline asm
-	mov.u32 	%r11, %tid.x;
-	// inline asm
-	setp.gt.u32 	%p1, %r11, 65535;
-	mov.u32 	%r24, %r11;
-	@%p1 bra 	BB0_2;
-
-BB0_1:
-	shl.b32 	%r14, %r3, 18;
-	ld.param.u32 	%r23, [intToFloat_param_1];
-	add.s32 	%r15, %r23, %r14;
-	shl.b32 	%r16, %r24, 2;
-	add.s32 	%r17, %r15, %r16;
-	ld.global.v2.u8 	{%rc5, %rc6}, [%r17];
-	// inline asm
-	cvt.rn.f32.s8 	%f1, %rc5;
-	// inline asm
-	// inline asm
-	cvt.rn.f32.s8 	%f2, %rc6;
-	// inline asm
-	shl.b32 	%r18, %r3, 20;
-	ld.param.u32 	%r22, [intToFloat_param_0];
-	add.s32 	%r19, %r22, %r18;
-	shl.b32 	%r20, %r24, 3;
-	add.s32 	%r21, %r19, %r20;
-	st.global.v2.f32 	[%r21], {%f1, %f2};
-	ld.global.v2.u8 	{%rc7, %rc8}, [%r17+2];
-	// inline asm
-	cvt.rn.f32.s8 	%f3, %rc7;
-	// inline asm
-	// inline asm
-	cvt.rn.f32.s8 	%f4, %rc8;
-	// inline asm
-	st.global.v2.f32 	[%r21+524288], {%f3, %f4};
-	// inline asm
-	mov.u32 	%r13, %ntid.x;
-	// inline asm
-	add.s32 	%r24, %r13, %r24;
-	setp.lt.u32 	%p2, %r24, 65536;
-	@%p2 bra 	BB0_1;
-
-BB0_2:
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/BeamFormer/Transpose.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/Transpose.cl-0.ptx
deleted file mode 100644
index 26a5c9cca9b954543c29abfbb6bb29d45835a803..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/BeamFormer/Transpose.cl-0.ptx
+++ /dev/null
@@ -1,134 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Sat Sep 29 10:12:13 2012 (1348906333)
-// Driver 304.54
-//
-
-.version 3.0
-.target sm_30, texmode_independent
-.address_size 32
-
-.extern .shared .align 16 .b8 shr_3_tmp[4352];
-
-.entry transposeComplexVoltages(
-	.param .u32 .ptr .global .align 1 transposeComplexVoltages_param_0,
-	.param .u32 .ptr .global .align 1 transposeComplexVoltages_param_1
-)
-{
-	.reg .f32 	%f<65>;
-	.reg .pred 	%p<2>;
-	.reg .s32 	%r<58>;
-
-
-	ld.param.u32 	%r26, [transposeComplexVoltages_param_0];
-	ld.param.u32 	%r27, [transposeComplexVoltages_param_1];
-	// inline asm
-	mov.u32 	%r13, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r14, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r15, %ctaid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r16, %tid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r17, %envreg5;
-	// inline asm
-	// inline asm
-	mov.u32 	%r18, %ntid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r19, %ctaid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r20, %tid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r21, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r22, %tid.x;
-	// inline asm
-	shr.u32 	%r28, %r22, 4;
-	// inline asm
-	mov.u32 	%r23, %tid.x;
-	// inline asm
-	shr.u32 	%r29, %r23, 4;
-	// inline asm
-	mov.u32 	%r24, %tid.x;
-	// inline asm
-	and.b32  	%r30, %r21, 15;
-	mov.u32 	%r31, shr_3_tmp;
-	mad.lo.s32 	%r32, %r30, 272, %r31;
-	and.b32  	%r33, %r22, -16;
-	add.s32 	%r1, %r32, %r33;
-	mad.lo.s32 	%r34, %r29, 272, %r31;
-	and.b32  	%r35, %r24, 15;
-	shl.b32 	%r36, %r35, 4;
-	add.s32 	%r2, %r34, %r36;
-	add.s32 	%r37, %r20, %r17;
-	mad.lo.s32 	%r38, %r19, %r18, %r37;
-	shl.b32 	%r39, %r38, 4;
-	add.s32 	%r40, %r28, %r39;
-	shl.b32 	%r41, %r40, 16;
-	add.s32 	%r42, %r16, %r13;
-	mad.lo.s32 	%r43, %r15, %r14, %r42;
-	shl.b32 	%r44, %r43, 4;
-	add.s32 	%r45, %r44, %r30;
-	shl.b32 	%r46, %r45, 4;
-	add.s32 	%r47, %r41, %r46;
-	add.s32 	%r48, %r47, %r27;
-	add.s32 	%r56, %r48, 4096;
-	add.s32 	%r49, %r29, %r44;
-	shl.b32 	%r50, %r49, 20;
-	add.s32 	%r51, %r39, %r35;
-	shl.b32 	%r52, %r51, 3;
-	add.s32 	%r53, %r50, %r52;
-	add.s32 	%r54, %r53, %r26;
-	add.s32 	%r55, %r54, 525056;
-	mov.u32 	%r57, 2048;
-
-BB0_1:
-	add.s32 	%r8, %r56, -4096;
-	ld.global.v4.f32 	{%f61, %f62, %f63, %f64}, [%r56+-4096];
-	st.shared.v4.f32 	[%r1], {%f61, %f62, %f63, %f64};
-	bar.sync 	0;
-	ld.shared.v4.f32 	{%f53, %f54, %f55, %f56}, [%r2];
-	add.s32 	%r9, %r55, -525056;
-	st.global.v2.f32 	[%r55+-525056], {%f53, %f54};
-	st.global.v2.f32 	[%r55+-768], {%f55, %f56};
-	bar.sync 	0;
-	ld.global.v4.f32 	{%f49, %f50, %f51, %f52}, [%r8+2048];
-	st.shared.v4.f32 	[%r1], {%f49, %f50, %f51, %f52};
-	bar.sync 	0;
-	ld.shared.v4.f32 	{%f41, %f42, %f43, %f44}, [%r2];
-	st.global.v2.f32 	[%r9+256], {%f41, %f42};
-	st.global.v2.f32 	[%r9+524544], {%f43, %f44};
-	bar.sync 	0;
-	ld.global.v4.f32 	{%f37, %f38, %f39, %f40}, [%r8+4096];
-	st.shared.v4.f32 	[%r1], {%f37, %f38, %f39, %f40};
-	bar.sync 	0;
-	ld.shared.v4.f32 	{%f29, %f30, %f31, %f32}, [%r2];
-	st.global.v2.f32 	[%r9+512], {%f29, %f30};
-	st.global.v2.f32 	[%r9+524800], {%f31, %f32};
-	bar.sync 	0;
-	ld.global.v4.f32 	{%f25, %f26, %f27, %f28}, [%r8+6144];
-	st.shared.v4.f32 	[%r1], {%f25, %f26, %f27, %f28};
-	bar.sync 	0;
-	ld.shared.v4.f32 	{%f17, %f18, %f19, %f20}, [%r2];
-	st.global.v2.f32 	[%r9+768], {%f17, %f18};
-	st.global.v2.f32 	[%r9+525056], {%f19, %f20};
-	bar.sync 	0;
-	add.s32 	%r56, %r56, 8192;
-	add.s32 	%r55, %r55, 1024;
-	add.s32 	%r57, %r57, -4;
-	setp.ne.s32 	%p1, %r57, 0;
-	@%p1 bra 	BB0_1;
-
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/CL/cl.hpp b/RTCP/GPUProc/src/CL/cl.hpp
index 5111f37dc8c74a540999ff6a06b9ae7ec07b6455..aa2b0bd040fd81ed57f433c242dcc1b39693f35a 100644
--- a/RTCP/GPUProc/src/CL/cl.hpp
+++ b/RTCP/GPUProc/src/CL/cl.hpp
@@ -1,4 +1,4 @@
-#include <iostream>
+//#include <iostream>
 /*******************************************************************************
  * Copyright (c) 2008-2011 The Khronos Group Inc.
  *
@@ -2069,9 +2069,9 @@ public:
     {
         cl_int error;
         object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
-if (!(flags & CL_MEM_ALLOC_HOST_PTR))
-#pragma omp critical (cout)
-std::cout << "A: Allocating " << size << " bytes" << std::endl;
+//if (!(flags & CL_MEM_ALLOC_HOST_PTR))
+//#pragma omp critical (cout)
+//std::cout << "A: Allocating " << size << " bytes" << std::endl;
 
         detail::errHandler(error, __CREATE_BUFFER_ERR);
         if (err != NULL) {
@@ -2090,9 +2090,9 @@ std::cout << "A: Allocating " << size << " bytes" << std::endl;
         Context context = Context::getDefault(err);
 
         object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
-if (!(flags & CL_MEM_ALLOC_HOST_PTR))
-#pragma omp critical (cout)
-std::cout << "B: Allocating " << size << " bytes" << std::endl;
+//if (!(flags & CL_MEM_ALLOC_HOST_PTR))
+//#pragma omp critical (cout)
+//std::cout << "B: Allocating " << size << " bytes" << std::endl;
 
         detail::errHandler(error, __CREATE_BUFFER_ERR);
         if (err != NULL) {
diff --git a/RTCP/GPUProc/src/Correlator.cl b/RTCP/GPUProc/src/Correlator.cl
index 9ca1ce91620cc5927211907be030ff33626164ce..5d41253f40d8ca7cabe4bc7a640ab1309ea913fd 100644
--- a/RTCP/GPUProc/src/Correlator.cl
+++ b/RTCP/GPUProc/src/Correlator.cl
@@ -1,7 +1,7 @@
 #define NR_BASELINES	 (NR_STATIONS * (NR_STATIONS + 1) / 2)
 
 #if NR_STATIONS == 288
-#define BLOCK_SIZE	 8
+#define BLOCK_SIZE	 6
 #elif defined NVIDIA_CUDA && NR_SAMPLES_PER_CHANNEL % 24 == 0
 #define BLOCK_SIZE	 24
 #else
@@ -24,7 +24,7 @@ __kernel void correlate(__global void *visibilitiesPtr,
   __local float samples[4][BLOCK_SIZE][NR_STATIONS | 1]; // avoid power-of-2
 
   uint baseline	    = get_global_id(0);
-  uint channel	    = get_global_id(1);
+  uint channel	    = get_global_id(1) + 1;
   uint stat_0	    = convert_uint_rtz(sqrt(convert_float(8 * baseline + 1)) - 0.99999f) / 2;
   uint stat_A	    = baseline - stat_0 * (stat_0 + 1) / 2;
 
@@ -84,7 +84,7 @@ __kernel void correlate_2x2(__global void *visibilitiesPtr,
 
   __local float4 samples[2][BLOCK_SIZE][(NR_STATIONS + 1) / 2 | 1]; // avoid power-of-2
 
-  uint channel	    = get_global_id(1);
+  uint channel	    = get_global_id(1) + 1;
   uint block	    = get_global_id(0);
 
   uint x	    = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2;
@@ -181,7 +181,7 @@ __kernel void correlate_3x3(__global void *visibilitiesPtr,
 
   __local float4 samples[3][BLOCK_SIZE][(NR_STATIONS + 2) / 3 | 1]; // avoid power-of-2
 
-  uint channel	    = get_global_id(1);
+  uint channel	    = get_global_id(1) + 1;
   uint block	    = get_global_id(0);
 
   uint x	    = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2;
@@ -338,7 +338,7 @@ __kernel void correlate_4x4(__global void *visibilitiesPtr,
 
   __local float4 samples[4][BLOCK_SIZE][(NR_STATIONS + 3) / 4 | 1]; // avoid power-of-2
 
-  uint channel	    = get_global_id(1);
+  uint channel	    = get_global_id(1) + 1;
   uint block	    = get_global_id(0);
 
   uint x	    = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2;
diff --git a/RTCP/GPUProc/src/Correlator.cl-0.ptx b/RTCP/GPUProc/src/Correlator.cl-0.ptx
deleted file mode 100644
index 37d40c5127c10f59970bebd14f4800ba1f03e8a1..0000000000000000000000000000000000000000
Binary files a/RTCP/GPUProc/src/Correlator.cl-0.ptx and /dev/null differ
diff --git a/RTCP/GPUProc/src/DelayAndBandPass.cl-0.ptx b/RTCP/GPUProc/src/DelayAndBandPass.cl-0.ptx
deleted file mode 100644
index 765b24b939aaab3b67274c4aedf221c92af81818..0000000000000000000000000000000000000000
Binary files a/RTCP/GPUProc/src/DelayAndBandPass.cl-0.ptx and /dev/null differ
diff --git a/RTCP/GPUProc/src/FIR.cl-0.ptx b/RTCP/GPUProc/src/FIR.cl-0.ptx
deleted file mode 100644
index d1a6fd1785efd5436cb75094fa168a1692b1643d..0000000000000000000000000000000000000000
Binary files a/RTCP/GPUProc/src/FIR.cl-0.ptx and /dev/null differ
diff --git a/RTCP/GPUProc/src/FilterBank.cc b/RTCP/GPUProc/src/FilterBank.cc
index beacf7cbcdf86a2243fda08c43abef4096a3b253..b33756a68ba03a56684f7a1d9025a730374293c8 100644
--- a/RTCP/GPUProc/src/FilterBank.cc
+++ b/RTCP/GPUProc/src/FilterBank.cc
@@ -174,152 +174,155 @@ void FilterBank::interpolate(const double x[], const double y[], unsigned xlen,
 // but has better stopband characteristics.
 void FilterBank::generate_fir_filter(unsigned n, double w, const double window[], double result[])
 {
-  // make sure grid is big enough for the window
-  // the grid must be at least (n+1)/2
-  // for all filters where the order is a power of two minus 1, grid_n = n+1;
-  unsigned grid_n = nextPowerOfTwo(n + 1);
+#pragma omp critical (FFTW)
+  {
+    // make sure grid is big enough for the window
+    // the grid must be at least (n+1)/2
+    // for all filters where the order is a power of two minus 1, grid_n = n+1;
+    unsigned grid_n = nextPowerOfTwo(n + 1);
 
-  unsigned ramp_n = 2; // grid_n/20;
+    unsigned ramp_n = 2; // grid_n/20;
 
-  // Apply ramps to discontinuities
-  // this is a low pass filter
-  // maybe we can omit the "w, 0" point?
-  // I did observe a small difference
-  double f[] = { 0.0, w - ramp_n / grid_n / 2.0, w, w + ramp_n / grid_n / 2.0, 1.0 };
-  double m[] = { 1.0, 1.0, 0.0, 0.0, 0.0 };
+    // Apply ramps to discontinuities
+    // this is a low pass filter
+    // maybe we can omit the "w, 0" point?
+    // I did observe a small difference
+    double f[] = { 0.0, w - ramp_n / grid_n / 2.0, w, w + ramp_n / grid_n / 2.0, 1.0 };
+    double m[] = { 1.0, 1.0, 0.0, 0.0, 0.0 };
 
-  // grid is a 1-D array with grid_n+1 points. Values are 1 in filter passband, 0 otherwise
-  std::vector<double> grid(grid_n + 1);
+    // grid is a 1-D array with grid_n+1 points. Values are 1 in filter passband, 0 otherwise
+    std::vector<double> grid(grid_n + 1);
 
-  // interpolate between grid points
-  interpolate(f, m, 5 /* length of f and m arrays */, grid_n + 1, &grid[0]);
+    // interpolate between grid points
+    interpolate(f, m, 5 /* length of f and m arrays */, grid_n + 1, &grid[0]);
 
 #if 0
-  std::stringstream logStr;
-  logStr << "interpolated = [";
-  for(unsigned i=0; i<grid_n+1; i++) {
-    logStr << grid[i];
-    if(i != grid_n+1-1) logStr << ", ";
-  }
-  logStr << "];";
-  LOG_DEBUG(logStr.str());
+    std::stringstream logStr;
+    logStr << "interpolated = [";
+    for(unsigned i=0; i<grid_n+1; i++) {
+      logStr << grid[i];
+      if(i != grid_n+1-1) logStr << ", ";
+    }
+    logStr << "];";
+    LOG_DEBUG(logStr.str());
 #endif
 
-  // the grid we do an ifft on is:
-  // grid appended with grid_n*2 zeros
-  // appended with original grid values from indices grid_n..2, i.e., the values in reverse order
-  // (note, arrays start at 1 in octave!)
-  // the input for the ifft is of size 4*grid_n
-  // input = [grid ; zeros(grid_n*2,1) ;grid(grid_n:-1:2)];
+    // the grid we do an ifft on is:
+    // grid appended with grid_n*2 zeros
+    // appended with original grid values from indices grid_n..2, i.e., the values in reverse order
+    // (note, arrays start at 1 in octave!)
+    // the input for the ifft is of size 4*grid_n
+    // input = [grid ; zeros(grid_n*2,1) ;grid(grid_n:-1:2)];
 
 #if defined HAVE_FFTW3
-  fftwf_complex* cinput = (fftwf_complex*) fftwf_malloc(grid_n * 4 * sizeof(fftwf_complex));
-  fftwf_complex* coutput = (fftwf_complex*) fftwf_malloc(grid_n * 4 * sizeof(fftwf_complex));
+    fftwf_complex* cinput = (fftwf_complex*) fftwf_malloc(grid_n * 4 * sizeof(fftwf_complex));
+    fftwf_complex* coutput = (fftwf_complex*) fftwf_malloc(grid_n * 4 * sizeof(fftwf_complex));
 #elif defined HAVE_FFTW2
-  fftw_complex* cinput = (fftw_complex*) fftw_malloc(grid_n*4*sizeof(fftw_complex));
-  fftw_complex* coutput = (fftw_complex*) fftw_malloc(grid_n*4*sizeof(fftw_complex));
+    fftw_complex* cinput = (fftw_complex*) fftw_malloc(grid_n*4*sizeof(fftw_complex));
+    fftw_complex* coutput = (fftw_complex*) fftw_malloc(grid_n*4*sizeof(fftw_complex));
 #endif
 
-  if (cinput == NULL || coutput == NULL) {
-    THROW(GPUProcException, "cannot allocate buffers");
-  }
+    if (cinput == NULL || coutput == NULL) {
+      THROW(GPUProcException, "cannot allocate buffers");
+    }
 
-  // wipe imaginary part
-  for (unsigned i = 0; i < grid_n * 4; i++) {
-    fftw_imag(cinput[i]) = 0.0;
-  }
+    // wipe imaginary part
+    for (unsigned i = 0; i < grid_n * 4; i++) {
+      fftw_imag(cinput[i]) = 0.0;
+    }
 
-  // copy first part of grid
-  for (unsigned i = 0; i < grid_n + 1; i++) {
-    fftw_real(cinput[i]) = grid[i];
-  }
+    // copy first part of grid
+    for (unsigned i = 0; i < grid_n + 1; i++) {
+      fftw_real(cinput[i]) = grid[i];
+    }
 
-  // append zeros
-  for (unsigned i = grid_n + 1; i <= grid_n * 3; i++) {
-    fftw_real(cinput[i]) = 0.0;
-  }
+    // append zeros
+    for (unsigned i = grid_n + 1; i <= grid_n * 3; i++) {
+      fftw_real(cinput[i]) = 0.0;
+    }
 
-  // now append the grid in reverse order
-  for (unsigned i = grid_n - 1, index = 0; i >= 1; i --, index ++) {
-    fftw_real(cinput[grid_n * 3 + 1 + index]) = grid[i];
-  }
+    // now append the grid in reverse order
+    for (unsigned i = grid_n - 1, index = 0; i >= 1; i --, index ++) {
+      fftw_real(cinput[grid_n * 3 + 1 + index]) = grid[i];
+    }
 
 #if 0
-  std::stringstream logStr;
-  logStr << "ifft_in = [";
-  for(unsigned i=0; i<grid_n*4; i++) {
-    logStr << fftw_real(cinput[i]) << " " << fftw_imag(cinput[i]);
-    if(i != grid_n*4-1) logStr << ", ";
-  }
-  logStr << "];";
-  LOG_DEBUG(logStr.str());
+    std::stringstream logStr;
+    logStr << "ifft_in = [";
+    for(unsigned i=0; i<grid_n*4; i++) {
+      logStr << fftw_real(cinput[i]) << " " << fftw_imag(cinput[i]);
+      if(i != grid_n*4-1) logStr << ", ";
+    }
+    logStr << "];";
+    LOG_DEBUG(logStr.str());
 #endif
 
 #if defined HAVE_FFTW3
-  fftwf_plan plan = fftwf_plan_dft_1d(grid_n * 4, cinput, coutput, FFTW_BACKWARD, FFTW_ESTIMATE);
-  fftwf_execute(plan);
+    fftwf_plan plan = fftwf_plan_dft_1d(grid_n * 4, cinput, coutput, FFTW_BACKWARD, FFTW_ESTIMATE);
+    fftwf_execute(plan);
 #elif defined HAVE_FFTW2
-  fftw_plan plan = fftw_create_plan(grid_n * 4, FFTW_BACKWARD, FFTW_ESTIMATE);
-  fftw_one(plan, cinput, coutput);
+    fftw_plan plan = fftw_create_plan(grid_n * 4, FFTW_BACKWARD, FFTW_ESTIMATE);
+    fftw_one(plan, cinput, coutput);
 #endif
 
 #if 0
-  for(unsigned i=0; i<grid_n*4; i++) {
-    LOG_DEBUG_STR("ifft result [" << i << "] = " << fftw_real(coutput[i]) << " " << fftw_imag(coutput[i]));
-  }
+    for(unsigned i=0; i<grid_n*4; i++) {
+      LOG_DEBUG_STR("ifft result [" << i << "] = " << fftw_real(coutput[i]) << " " << fftw_imag(coutput[i]));
+    }
 #endif
 
-  //                        half                   end
-  // 1 2       n+1          2(n+1)      3(n+1)     4(n+1)
-  //                                    x x x x x x x x x     # last quarter
-  //   x x x x x x                                            # first quarter
+    //                        half                   end
+    // 1 2       n+1          2(n+1)      3(n+1)     4(n+1)
+    //                                    x x x x x x x x x     # last quarter
+    //   x x x x x x                                            # first quarter
 
-  // last_quarter  = b([end-n+1:2:end]); # the size is only 1/8, since we skip half of the elements
-  // first_quarter = b(2:2:(n+1));       # the size is only 1/8, since we skip half of the elements
+    // last_quarter  = b([end-n+1:2:end]); # the size is only 1/8, since we skip half of the elements
+    // first_quarter = b(2:2:(n+1));       # the size is only 1/8, since we skip half of the elements
 
-  unsigned index = 0;
+    unsigned index = 0;
 
-  for (unsigned i = 4 * grid_n - n; i < 4 * grid_n; i += 2) {
-    result[index] = fftw_real(coutput[i]);
-    index++;
-  }
+    for (unsigned i = 4 * grid_n - n; i < 4 * grid_n; i += 2) {
+      result[index] = fftw_real(coutput[i]);
+      index++;
+    }
 
-  for (unsigned i = 1; i <= n; i += 2) {
-    result[index] = fftw_real(coutput[i]);
-    index++;
-  }
+    for (unsigned i = 1; i <= n; i += 2) {
+      result[index] = fftw_real(coutput[i]);
+      index++;
+    }
 
 #if defined HAVE_FFTW3
-  fftwf_destroy_plan(plan);
-  fftwf_free(cinput);
-  fftwf_free(coutput);
+    fftwf_destroy_plan(plan);
+    fftwf_free(cinput);
+    fftwf_free(coutput);
 #elif defined HAVE_FFTW2
-  fftw_destroy_plan(plan);
-  fftw_free(cinput);
-  fftw_free(coutput);
+    fftw_destroy_plan(plan);
+    fftw_free(cinput);
+    fftw_free(coutput);
 #endif
 
-  // multiply with window
-  for (unsigned i = 0; i <= n; i++) {
-    result[i] *= window[i];
-  }
+    // multiply with window
+    for (unsigned i = 0; i <= n; i++) {
+      result[i] *= window[i];
+    }
 
-  // normalize
-  double factor = result[n / 2];
-  for (unsigned i = 0; i <= n; i++) {
-    result[i] /= factor;
-  }
+    // normalize
+    double factor = result[n / 2];
+    for (unsigned i = 0; i <= n; i++) {
+      result[i] /= factor;
+    }
 
 #if 0
-  std::stringstream logStr;
-  logStr << "result = [";
-  for(unsigned i=0; i<=n; i++) {
-    logStr << result[i];
-    if(i != n) logStr << ", ";
-  }
-  logStr << "];";
-  LOG_DEBUG(logStr.str());
+    std::stringstream logStr;
+    logStr << "result = [";
+    for(unsigned i=0; i<=n; i++) {
+      logStr << result[i];
+      if(i != n) logStr << ", ";
+    }
+    logStr << "];";
+    LOG_DEBUG(logStr.str());
 #endif
+  }
 }
 
 
diff --git a/RTCP/GPUProc/src/NewCorrelator.cl b/RTCP/GPUProc/src/NewCorrelator.cl
index 307c70f347743ea6201b6fd2a87fad59ad62812d..6d4e6ee4bc3782018a89d6287278253fc1a59c1e 100644
--- a/RTCP/GPUProc/src/NewCorrelator.cl
+++ b/RTCP/GPUProc/src/NewCorrelator.cl
@@ -1,72 +1,124 @@
 #define NR_STATIONS_PER_BLOCK	32
-#define BLOCK_SIZE		8
+#define NR_TIMES_PER_BLOCK	8
 
 #define NR_BASELINES		(NR_STATIONS * (NR_STATIONS + 1) / 2)
 
 
-typedef __global float (*CorrectedDataType)[NR_STATIONS][NR_CHANNELS][NR_SAMPLES_PER_CHANNEL][NR_POLARIZATIONS * 2];
+typedef __global float4 (*CorrectedDataType)[NR_STATIONS][NR_CHANNELS][NR_SAMPLES_PER_CHANNEL];
 typedef __global float8 (*VisibilitiesType)[NR_BASELINES][NR_CHANNELS];
 
 
-#if 0
-__kernel void correlateTriangles(__global void *visibilitiesPtr,
-				 __global const void *correctedDataPtr
-)
+__kernel
+void correlateTriangleKernel(__global void *visibilitiesPtr,
+	       __global const void *correctedDataPtr)
 {
   VisibilitiesType visibilities = (VisibilitiesType) visibilitiesPtr;
   CorrectedDataType correctedData = (CorrectedDataType) correctedDataPtr;
 
-  __local float4 samples[BLOCK_SIZE][NR_STATIONS_PER_BLOCK];
+  __local float4 samples[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1];
+  uint channel = get_global_id(2) + 1;
+  uint block = get_global_id(1);
+
+#if NR_STATIONS % NR_STATIONS_PER_BLOCK == 0
+  uint firstStation = block * NR_STATIONS_PER_BLOCK;
+  uint nrStationsThisBlock = NR_STATIONS_PER_BLOCK;
+#else
+  uint lastStation = block * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK;
+  uint firstStation = block == 0 ? 0 : lastStation - NR_STATIONS_PER_BLOCK;
+  uint nrStationsThisBlock = lastStation - firstStation;
+#endif
 
-  uint triangle = get_global_id(1);
-  uint channel  = get_global_id(2);
-  uint firstStation = triangle * NR_STATIONS_PER_BLOCK;
+  uint miniBlock = get_local_id(0);
+  uint statXoffset = convert_uint_rtz(sqrt(convert_float(8 * miniBlock + 1)) - 0.99999f) / 2;
+  uint statYoffset = miniBlock - statXoffset * (statXoffset + 1) / 2;
 
-  float4 vis_0A_r = (float4) 0, vis_0A_i = (float4) 0;
-  float4 vis_0B_r = (float4) 0, vis_0B_i = (float4) 0;
-  float4 vis_1A_r = (float4) 0, vis_1A_i = (float4) 0;
-  float4 vis_1B_r = (float4) 0, vis_1B_i = (float4) 0;
+  statXoffset *= 2, statYoffset *= 2;
 
-  for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += BLOCK_SIZE) {
+  float4 vis_0A_r = 0, vis_0A_i = 0;
+  float4 vis_0B_r = 0, vis_0B_i = 0;
+  float4 vis_1A_r = 0, vis_1A_i = 0;
+  float4 vis_1B_r = 0, vis_1B_i = 0;
+
+  bool doCorrelate = statXoffset < nrStationsThisBlock;
+
+  for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) {
     // load data into local memory
-#pragma unroll 1
-    for (uint i = get_local_id(0); i < BLOCK_SIZE * NR_STATIONS_PER_BLOCK; i += get_local_size(0)) {
-      uint time = i % BLOCK_SIZE;
-      uint stat = i / BLOCK_SIZE;
+    barrier(CLK_LOCAL_MEM_FENCE);
 
-      if (firstStation + stat < NR_STATIONS)
-	samples[time][stat] = (*correctedData)[firstStation + stat][channel][major + time];
+    for (uint i = get_local_id(0); i < nrStationsThisBlock * NR_TIMES_PER_BLOCK; i += get_local_size(0)) {
+      uint time = i % NR_TIMES_PER_BLOCK;
+      uint stat = i / NR_TIMES_PER_BLOCK;
+
+      samples[stat % 2][time][stat / 2] = (*correctedData)[firstStation + stat][channel][major + time];
     }
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    // compute auto-correlations
+#pragma unroll 1
+    for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) {
+      float4 sample_0, sample_1, sample_A, sample_B;
+
+      if (doCorrelate) {
+        sample_0 = samples[0][time][statYoffset / 2];
+        sample_A = samples[0][time][statXoffset / 2];
+        sample_B = samples[1][time][statXoffset / 2];
+        sample_1 = samples[1][time][statYoffset / 2];
+
+        vis_0A_r += sample_0.xxzz * sample_A.xzxz;
+        vis_0A_i += sample_0.yyww * sample_A.xzxz;
+        vis_0B_r += sample_0.xxzz * sample_B.xzxz;
+        vis_0B_i += sample_0.yyww * sample_B.xzxz;
+        vis_1A_r += sample_1.xxzz * sample_A.xzxz;
+        vis_1A_i += sample_1.yyww * sample_A.xzxz;
+        vis_1B_r += sample_1.xxzz * sample_B.xzxz;
+        vis_1B_i += sample_1.yyww * sample_B.xzxz;
 
-    if (firstStation + get_local_id(0) < NR_STATIONS) {
-      for (time = 0; time + BLOCK_SIZE; time ++) {
-	float sample = samples[time][get_local_id(0)];
+        vis_0A_r += sample_0.yyww * sample_A.ywyw;
+        vis_0A_i -= sample_0.xxzz * sample_A.ywyw;
+        vis_0B_r += sample_0.yyww * sample_B.ywyw;
+        vis_0B_i -= sample_0.xxzz * sample_B.ywyw;
+        vis_1A_r += sample_1.yyww * sample_A.ywyw;
+        vis_1A_i -= sample_1.xxzz * sample_A.ywyw;
+        vis_1B_r += sample_1.yyww * sample_B.ywyw;
+        vis_1B_i -= sample_1.xxzz * sample_B.ywyw;
       }
+    }
+  }
 
+  int  statY = firstStation + statYoffset;
+  uint statX = firstStation + statXoffset;
+  uint baseline = (statX * (statX + 1) / 2) + statY;
 
-    barrier(CLK_LOCAL_MEM_FENCE);
+  if (statXoffset < nrStationsThisBlock) {
+    (*visibilities)[baseline	        ][channel].even = vis_0A_r;
+    (*visibilities)[baseline	        ][channel].odd  = vis_0A_i;
+  }
+
+  if (statXoffset < nrStationsThisBlock && statYoffset + 1 < nrStationsThisBlock) {
+    (*visibilities)[baseline	     + 1][channel].even = vis_1A_r;
+    (*visibilities)[baseline	     + 1][channel].odd  = vis_1A_i;
+  }
+
+  if (statXoffset + 1 < nrStationsThisBlock) {
+    (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r;
+    (*visibilities)[baseline + statX + 1][channel].odd  = vis_0B_i;
+    (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r;
+    (*visibilities)[baseline + statX + 2][channel].odd  = vis_1B_i;
   }
 }
-#endif
 
 
 __kernel __attribute__((reqd_work_group_size(NR_STATIONS_PER_BLOCK * NR_STATIONS_PER_BLOCK / 4, 1, 1)))
-void correlateRectangles(__global void *visibilitiesPtr,
-			 __global const void *correctedDataPtr
-)
+void correlateRectangleKernel(__global void *visibilitiesPtr,
+	       __global const void *correctedDataPtr)
 {
-  VisibilitiesType visibilities = (VisibilitiesType) visibilitiesPtr;
+  VisibilitiesType  visibilities  = (VisibilitiesType)  visibilitiesPtr;
   CorrectedDataType correctedData = (CorrectedDataType) correctedDataPtr;
 
-  __local float4 samplesX[2][BLOCK_SIZE][NR_STATIONS_PER_BLOCK / 2 | 1];
-  __local float4 samplesY[2][BLOCK_SIZE][NR_STATIONS_PER_BLOCK / 2 | 1];
+  __local float4 samplesX[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1];
+  __local float4 samplesY[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1];
 
   uint block	 = get_global_id(1);
-  uint channel   = get_global_id(2);
   uint blockX	 = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2;
   uint blockY	 = block - blockX * (blockX + 1) / 2;
 
@@ -78,35 +130,291 @@ void correlateRectangles(__global void *visibilitiesPtr,
   int  firstStationY = (blockY - 1) * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK;
 #endif
 
-  uint statXoffset = (get_local_id(0) / (NR_STATIONS_PER_BLOCK / 2));
-  uint statYoffset = (get_local_id(0) % (NR_STATIONS_PER_BLOCK / 2));
+  uint statXoffset = get_local_id(0) / (NR_STATIONS_PER_BLOCK / 2);
+  uint statYoffset = get_local_id(0) % (NR_STATIONS_PER_BLOCK / 2);
+
+  float4 vis_0A_r = 0, vis_0A_i = 0;
+  float4 vis_0B_r = 0, vis_0B_i = 0;
+  float4 vis_1A_r = 0, vis_1A_i = 0;
+  float4 vis_1B_r = 0, vis_1B_i = 0;
+
+  uint loadTime = get_local_id(0) % NR_TIMES_PER_BLOCK;
+  uint loadStat = get_local_id(0) / NR_TIMES_PER_BLOCK;
+
+  bool doCorrelateLower = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= 0;
+  bool doCorrelateUpper = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= -1;
+  bool doLoadY     = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + loadStat) >= 0;
+
+  uint channel = get_global_id(2) + 1;
+
+  for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) {
+    // load data into local memory
+    float4 sampleX = (*correctedData)[firstStationX + loadStat][channel][major + loadTime];
+    float4 sampleY;
+
+    if (doLoadY)
+      sampleY = (*correctedData)[firstStationY + loadStat][channel][major + loadTime];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    samplesX[loadStat % 2][loadTime][loadStat / 2] = sampleX;
+
+    if (doLoadY)
+      samplesY[loadStat % 2][loadTime][loadStat / 2] = sampleY;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+#pragma unroll 1
+    for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) {
+      float4 sample_0, sample_1, sample_A, sample_B;
+
+      if (doCorrelateLower) {
+        sample_0 = samplesY[0][time][statYoffset];
+      }
+
+      if (doCorrelateUpper) {
+        sample_A = samplesX[0][time][statXoffset];
+        sample_B = samplesX[1][time][statXoffset];
+        sample_1 = samplesY[1][time][statYoffset];
+      }
+
+      if (doCorrelateLower) {
+        vis_0A_r += sample_0.xxzz * sample_A.xzxz;
+        vis_0A_i += sample_0.yyww * sample_A.xzxz;
+        vis_0B_r += sample_0.xxzz * sample_B.xzxz;
+        vis_0B_i += sample_0.yyww * sample_B.xzxz;
+        vis_0A_r += sample_0.yyww * sample_A.ywyw;
+        vis_0A_i -= sample_0.xxzz * sample_A.ywyw;
+        vis_0B_r += sample_0.yyww * sample_B.ywyw;
+        vis_0B_i -= sample_0.xxzz * sample_B.ywyw;
+      }
+
+      if (doCorrelateUpper) {
+        vis_1A_r += sample_1.xxzz * sample_A.xzxz;
+        vis_1A_i += sample_1.yyww * sample_A.xzxz;
+        vis_1B_r += sample_1.xxzz * sample_B.xzxz;
+        vis_1B_i += sample_1.yyww * sample_B.xzxz;
+        vis_1A_r += sample_1.yyww * sample_A.ywyw;
+        vis_1A_i -= sample_1.xxzz * sample_A.ywyw;
+        vis_1B_r += sample_1.yyww * sample_B.ywyw;
+        vis_1B_i -= sample_1.xxzz * sample_B.ywyw;
+      }
+    }
+  }
+
+  int  statY = firstStationY + 2 * statYoffset;
+  uint statX = firstStationX + 2 * statXoffset;
+  uint baseline = (statX * (statX + 1) / 2) + statY;
+
+  if (doCorrelateLower) {
+    (*visibilities)[baseline	        ][channel].even = vis_0A_r;
+    (*visibilities)[baseline	        ][channel].odd  = vis_0A_i;
+    (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r;
+    (*visibilities)[baseline + statX + 1][channel].odd  = vis_0B_i;
+  }
+
+  if (doCorrelateUpper) {
+    (*visibilities)[baseline	     + 1][channel].even = vis_1A_r;
+    (*visibilities)[baseline	     + 1][channel].odd  = vis_1A_i;
+    (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r;
+    (*visibilities)[baseline + statX + 2][channel].odd  = vis_1B_i;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void correlateTriangle(VisibilitiesType visibilities,
+		       CorrectedDataType correctedData,
+		       __local float4 samples[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1],
+		       uint block)
+{
+  uint channel = get_global_id(2) + 1;
+
+#if NR_STATIONS % NR_STATIONS_PER_BLOCK == 0
+  uint firstStation = block * NR_STATIONS_PER_BLOCK;
+#else
+  int firstStation = (block - 1) * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK;
+#endif
+
+  bool doCorrelate = false, doAutoCorrelate = false, doNearAutoCorrelate = false;
+  uint statXoffset, statYoffset;
+
+  if (get_local_id(0) < 128) {
+    uint miniBlock = get_local_id(0);
+    uint miniBlockX = convert_uint_rtz(sqrt(convert_float(8 * miniBlock + 1)) - 0.99999f) / 2;
+    uint miniBlockY = miniBlock - miniBlockX * (miniBlockX + 1) / 2;
+
+    statXoffset = 2 * miniBlockX + 2;
+    statYoffset = 2 * miniBlockY;
+    doCorrelate = statXoffset < NR_STATIONS_PER_BLOCK && (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStation + statYoffset) >= 0);
+  } else if (get_local_id(0) < 128 + NR_STATIONS_PER_BLOCK / 2) {
+    statXoffset = statYoffset = 2 * (get_local_id(0) - 128);
+    // actually, it is the visibility one right of statXoffset
+    doNearAutoCorrelate = (int) (firstStation + statXoffset) >= 0;
+  } else if (get_local_id(0) >= 192 && get_local_id(0) < 192 + NR_STATIONS_PER_BLOCK) {
+    statXoffset = statYoffset = get_local_id(0) - 192;
+    doAutoCorrelate = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStation + statYoffset) >= 0;
+  }
 
   float4 vis_0A_r = 0, vis_0A_i = 0;
   float4 vis_0B_r = 0, vis_0B_i = 0;
   float4 vis_1A_r = 0, vis_1A_i = 0;
   float4 vis_1B_r = 0, vis_1B_i = 0;
 
-  for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += BLOCK_SIZE) {
+  uint loadTime = get_local_id(0) % NR_TIMES_PER_BLOCK;
+  uint loadStat = get_local_id(0) / NR_TIMES_PER_BLOCK;
+
+  bool doLoad     = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStation + loadStat) >= 0;
+
+  for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) {
     // load data into local memory
-    for (uint i = get_local_id(0); i < 4 * BLOCK_SIZE * NR_STATIONS_PER_BLOCK; i += NR_STATIONS_PER_BLOCK * NR_STATIONS_PER_BLOCK / 4) {
-      uint p = i % 4;
-      uint time = i / 4 % BLOCK_SIZE;
-      uint stat = i / 4 / BLOCK_SIZE;
+    float4 sample;
+
+    if (doLoad)
+      sample = (*correctedData)[firstStation + loadStat][channel][major + loadTime];
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (doLoad)
+      samples[loadStat % 2][loadTime][loadStat / 2] = sample;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+#pragma unroll 1
+    for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) {
+      if (doCorrelate) {
+        float4 sample_0 = samples[0][time][statYoffset / 2];
+        float4 sample_A = samples[0][time][statXoffset / 2];
+        float4 sample_B = samples[1][time][statXoffset / 2];
+        float4 sample_1 = samples[1][time][statYoffset / 2];
+
+        vis_0A_r += sample_0.xxzz * sample_A.xzxz;
+        vis_0A_i += sample_0.yyww * sample_A.xzxz;
+        vis_0B_r += sample_0.xxzz * sample_B.xzxz;
+        vis_0B_i += sample_0.yyww * sample_B.xzxz;
+        vis_0A_r += sample_0.yyww * sample_A.ywyw;
+        vis_0A_i -= sample_0.xxzz * sample_A.ywyw;
+        vis_0B_r += sample_0.yyww * sample_B.ywyw;
+        vis_0B_i -= sample_0.xxzz * sample_B.ywyw;
+
+        vis_1A_r += sample_1.xxzz * sample_A.xzxz;
+        vis_1A_i += sample_1.yyww * sample_A.xzxz;
+        vis_1B_r += sample_1.xxzz * sample_B.xzxz;
+        vis_1B_i += sample_1.yyww * sample_B.xzxz;
+        vis_1A_r += sample_1.yyww * sample_A.ywyw;
+        vis_1A_i -= sample_1.xxzz * sample_A.ywyw;
+        vis_1B_r += sample_1.yyww * sample_B.ywyw;
+        vis_1B_i -= sample_1.xxzz * sample_B.ywyw;
+      }
 
-      ((__local float *) &samplesX[stat % 2][time][stat / 2])[p] = (*correctedData)[firstStationX + stat][channel][major + time][p];
+      if (doAutoCorrelate) {
+        float4 sample_0 = samples[statYoffset % 2][time][statYoffset / 2];
+	vis_0A_r.xyw += sample_0.xxz * sample_0.xzz;
+	vis_0A_i.y   += sample_0.y * sample_0.z;
+	vis_0A_r.xyw += sample_0.yyw * sample_0.yww;
+	vis_0A_i.y   -= sample_0.x * sample_0.w;
+      }
 
-      if (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + stat) >= 0)
-	((__local float *) &samplesY[stat % 2][time][stat / 2])[p] = (*correctedData)[firstStationY + stat][channel][major + time][p];
+      if (doNearAutoCorrelate) {
+        float4 sample_0 = samples[0][time][statYoffset / 2];
+        float4 sample_B = samples[1][time][statXoffset / 2];
+        vis_0B_r += sample_0.xxzz * sample_B.xzxz;
+        vis_0B_i += sample_0.yyww * sample_B.xzxz;
+        vis_0B_r += sample_0.yyww * sample_B.ywyw;
+        vis_0B_i -= sample_0.xxzz * sample_B.ywyw;
+      }
     }
+  }
+
+  if (doAutoCorrelate) {
+    vis_0A_r.z = vis_0A_r.y;
+    vis_0A_i.z = -vis_0A_i.y;
+  }
+
+  int  statY = firstStation + statYoffset;
+  uint statX = firstStation + statXoffset;
+  uint baseline = (statX * (statX + 1) / 2) + statY;
+
+  if (doCorrelate || doAutoCorrelate) {
+    (*visibilities)[baseline	        ][channel].even = vis_0A_r;
+    (*visibilities)[baseline	        ][channel].odd  = vis_0A_i;
+  }
+
+  if (doCorrelate || doNearAutoCorrelate) {
+    (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r;
+    (*visibilities)[baseline + statX + 1][channel].odd  = vis_0B_i;
+  }
+
+  if (doCorrelate) {
+    (*visibilities)[baseline	     + 1][channel].even = vis_1A_r;
+    (*visibilities)[baseline	     + 1][channel].odd  = vis_1A_i;
+    (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r;
+    (*visibilities)[baseline + statX + 2][channel].odd  = vis_1B_i;
+  }
+}
+
+
+void correlateTriangle2(VisibilitiesType visibilities,
+			CorrectedDataType correctedData,
+			__local float4 samples[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1],
+			uint block
+)
+{
+  uint channel = get_global_id(2) + 1;
+
+#if NR_STATIONS % NR_STATIONS_PER_BLOCK == 0
+  uint firstStation = block * NR_STATIONS_PER_BLOCK;
+  //uint lastStation = firstStation + NR_STATIONS_PER_BLOCK;
+  uint nrStationsThisBlock = NR_STATIONS_PER_BLOCK;
+#else
+  uint lastStation = block * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK;
+  uint firstStation = block == 0 ? 0 : lastStation - NR_STATIONS_PER_BLOCK;
+  uint nrStationsThisBlock = lastStation - firstStation;
+#endif
+
+  uint miniBlock = get_local_id(0);
+  uint statXoffset = convert_uint_rtz(sqrt(convert_float(8 * miniBlock + 1)) - 0.99999f) / 2;
+  uint statYoffset = miniBlock - statXoffset * (statXoffset + 1) / 2;
+
+  statXoffset *= 2, statYoffset *= 2;
+
+  //bool doCorrelate = statXoffset < nrStationsThisBlock;
+
+  float4 vis_0A_r = 0, vis_0A_i = 0;
+  float4 vis_0B_r = 0, vis_0B_i = 0;
+  float4 vis_1A_r = 0, vis_1A_i = 0;
+  float4 vis_1B_r = 0, vis_1B_i = 0;
+
+  uint loadTime = get_local_id(0) % NR_TIMES_PER_BLOCK;
+  uint loadStat = get_local_id(0) / NR_TIMES_PER_BLOCK;
+
+  bool doCorrelateLeft = statXoffset < nrStationsThisBlock;
+  //bool doCorrelateRight = statXoffset + 1 < nrStationsThisBlock;
+  bool doLoad = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || loadStat < nrStationsThisBlock;
+
+  for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) {
+    // load data into local memory
+    float4 sample;
+
+    if (doLoad)
+      sample = (*correctedData)[firstStation + loadStat][channel][major + loadTime];
 
     barrier(CLK_LOCAL_MEM_FENCE);
 
-    if (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= -1) {
-      for (uint time = 0; time < BLOCK_SIZE; time ++) {
-        float4 sample_0 = samplesY[0][time][statYoffset];
-        float4 sample_A = samplesX[0][time][statXoffset];
-        float4 sample_B = samplesX[1][time][statXoffset];
-        float4 sample_1 = samplesY[1][time][statYoffset];
+    if (doLoad)
+      samples[loadStat % 2][loadTime][loadStat / 2] = sample;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+#pragma unroll 1
+    for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) {
+      float4 sample_0, sample_1, sample_A, sample_B;
+
+      if (doCorrelateLeft) {
+        sample_0 = samples[0][time][statYoffset / 2];
+        sample_A = samples[0][time][statXoffset / 2];
+        sample_B = samples[1][time][statXoffset / 2];
+        sample_1 = samples[1][time][statYoffset / 2];
 
         vis_0A_r += sample_0.xxzz * sample_A.xzxz;
         vis_0A_i += sample_0.yyww * sample_A.xzxz;
@@ -127,25 +435,152 @@ void correlateRectangles(__global void *visibilitiesPtr,
         vis_1B_i -= sample_1.xxzz * sample_B.ywyw;
       }
     }
+  }
+
+  int  statY = firstStation + statYoffset;
+  uint statX = firstStation + statXoffset;
+  uint baseline = (statX * (statX + 1) / 2) + statY;
+
+  if (statXoffset < nrStationsThisBlock) {
+    (*visibilities)[baseline	        ][channel].even = vis_0A_r;
+    (*visibilities)[baseline	        ][channel].odd  = vis_0A_i;
+  }
+
+  if (statXoffset < nrStationsThisBlock && statYoffset + 1 < nrStationsThisBlock) {
+    (*visibilities)[baseline	     + 1][channel].even = vis_1A_r;
+    (*visibilities)[baseline	     + 1][channel].odd  = vis_1A_i;
+  }
+
+  if (statXoffset + 1 < nrStationsThisBlock) {
+    (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r;
+    (*visibilities)[baseline + statX + 1][channel].odd  = vis_0B_i;
+    (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r;
+    (*visibilities)[baseline + statX + 2][channel].odd  = vis_1B_i;
+  }
+}
+
+
+void correlateRectangle(VisibilitiesType visibilities,
+			CorrectedDataType correctedData,
+			__local float4 samplesX[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1],
+			__local float4 samplesY[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1],
+			uint blockX,
+			uint blockY
+)
+{
+  uint channel = get_global_id(2) + 1;
+
+#if NR_STATIONS % NR_STATIONS_PER_BLOCK == 0
+  uint firstStationX = blockX * NR_STATIONS_PER_BLOCK;
+  uint firstStationY = blockY * NR_STATIONS_PER_BLOCK;
+#else
+  uint firstStationX = (blockX - 1) * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK;
+  int  firstStationY = (blockY - 1) * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK;
+#endif
+
+  uint statXoffset = get_local_id(0) / (NR_STATIONS_PER_BLOCK / 2);
+  uint statYoffset = get_local_id(0) % (NR_STATIONS_PER_BLOCK / 2);
+
+  float4 vis_0A_r = 0, vis_0A_i = 0;
+  float4 vis_0B_r = 0, vis_0B_i = 0;
+  float4 vis_1A_r = 0, vis_1A_i = 0;
+  float4 vis_1B_r = 0, vis_1B_i = 0;
+
+  uint loadTime = get_local_id(0) % NR_TIMES_PER_BLOCK;
+  uint loadStat = get_local_id(0) / NR_TIMES_PER_BLOCK;
+
+  bool doCorrelateLower = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= 0;
+  bool doCorrelateUpper = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= -1;
+  bool doLoadY     = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + loadStat) >= 0;
+
+  for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) {
+    // load data into local memory
+    float4 sampleX = (*correctedData)[firstStationX + loadStat][channel][major + loadTime];
+    float4 sampleY;
+
+    if (doLoadY)
+      sampleY = (*correctedData)[firstStationY + loadStat][channel][major + loadTime];
 
     barrier(CLK_LOCAL_MEM_FENCE);
+
+    samplesX[loadStat % 2][loadTime][loadStat / 2] = sampleX;
+
+    if (doLoadY)
+      samplesY[loadStat % 2][loadTime][loadStat / 2] = sampleY;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+#pragma unroll 1
+    for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) {
+      float4 sample_0, sample_1, sample_A, sample_B;
+
+      if (doCorrelateLower) {
+        sample_0 = samplesY[0][time][statYoffset];
+      }
+
+      if (doCorrelateUpper) {
+        sample_A = samplesX[0][time][statXoffset];
+        sample_B = samplesX[1][time][statXoffset];
+        sample_1 = samplesY[1][time][statYoffset];
+      }
+
+      if (doCorrelateLower) {
+        vis_0A_r += sample_0.xxzz * sample_A.xzxz;
+        vis_0A_i += sample_0.yyww * sample_A.xzxz;
+        vis_0B_r += sample_0.xxzz * sample_B.xzxz;
+        vis_0B_i += sample_0.yyww * sample_B.xzxz;
+        vis_0A_r += sample_0.yyww * sample_A.ywyw;
+        vis_0A_i -= sample_0.xxzz * sample_A.ywyw;
+        vis_0B_r += sample_0.yyww * sample_B.ywyw;
+        vis_0B_i -= sample_0.xxzz * sample_B.ywyw;
+      }
+
+      if (doCorrelateUpper) {
+        vis_1A_r += sample_1.xxzz * sample_A.xzxz;
+        vis_1A_i += sample_1.yyww * sample_A.xzxz;
+        vis_1B_r += sample_1.xxzz * sample_B.xzxz;
+        vis_1B_i += sample_1.yyww * sample_B.xzxz;
+        vis_1A_r += sample_1.yyww * sample_A.ywyw;
+        vis_1A_i -= sample_1.xxzz * sample_A.ywyw;
+        vis_1B_r += sample_1.yyww * sample_B.ywyw;
+        vis_1B_i -= sample_1.xxzz * sample_B.ywyw;
+      }
+    }
   }
 
   int  statY = firstStationY + 2 * statYoffset;
   uint statX = firstStationX + 2 * statXoffset;
   uint baseline = (statX * (statX + 1) / 2) + statY;
 
-  if (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || statY >= 0) {
-    (*visibilities)[baseline    ][channel].even = vis_0A_r;
-    (*visibilities)[baseline    ][channel].odd  = vis_0A_i;
-    (*visibilities)[baseline + 1][channel].even = vis_1A_r;
-    (*visibilities)[baseline + 1][channel].odd  = vis_1A_i;
-  }
-
-  if (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || statY >= -1) {
+  if (doCorrelateLower) {
+    (*visibilities)[baseline	        ][channel].even = vis_0A_r;
+    (*visibilities)[baseline	        ][channel].odd  = vis_0A_i;
     (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r;
     (*visibilities)[baseline + statX + 1][channel].odd  = vis_0B_i;
+  }
+
+  if (doCorrelateUpper) {
+    (*visibilities)[baseline	     + 1][channel].even = vis_1A_r;
+    (*visibilities)[baseline	     + 1][channel].odd  = vis_1A_i;
     (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r;
     (*visibilities)[baseline + statX + 2][channel].odd  = vis_1B_i;
   }
 }
+
+
+__kernel __attribute__((reqd_work_group_size(NR_STATIONS_PER_BLOCK * NR_STATIONS_PER_BLOCK / 4, 1, 1)))
+void correlate(__global void *visibilitiesPtr,
+	       __global const void *correctedDataPtr)
+{
+  __local float4 samplesX[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1];
+  __local float4 samplesY[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1];
+
+  uint block	 = get_global_id(1);
+  uint blockX	 = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2;
+  uint blockY	 = block - blockX * (blockX + 1) / 2;
+
+  if (blockX == blockY)
+    correlateTriangle2((VisibilitiesType) visibilitiesPtr, (CorrectedDataType) correctedDataPtr, samplesX, blockX);
+  else
+    correlateRectangle((VisibilitiesType) visibilitiesPtr, (CorrectedDataType) correctedDataPtr, samplesX, samplesY, blockX, blockY);
+}
diff --git a/RTCP/GPUProc/src/NewCorrelator.cl-0.ptx b/RTCP/GPUProc/src/NewCorrelator.cl-0.ptx
deleted file mode 100644
index c69f71f47ba5748d5cb342ecbed29c627f766f0f..0000000000000000000000000000000000000000
Binary files a/RTCP/GPUProc/src/NewCorrelator.cl-0.ptx and /dev/null differ
diff --git a/RTCP/GPUProc/src/RTCP.cc b/RTCP/GPUProc/src/RTCP.cc
index 0e6c8b99b6746c48388c67fd96eb40a36d5e0329..9c6a8830cdc4fcc45bba57c306ff434c9b3f56e9 100644
--- a/RTCP/GPUProc/src/RTCP.cc
+++ b/RTCP/GPUProc/src/RTCP.cc
@@ -14,11 +14,11 @@
 #include <iostream>
 #include <sstream>
 #include <boost/multi_array.hpp>
+#include <boost/date_time/posix_time/posix_time.hpp>
 
 #include "Align.h"
-#include "BandPass.h"
 #include "ApplCommon/PosixTime.h"
-#include <boost/date_time/posix_time/posix_time.hpp>
+#include "BandPass.h"
 #include "Common/LofarLogger.h"
 #include "Common/SystemUtil.h"
 #include "FilterBank.h"
@@ -30,6 +30,11 @@
 #include "UHEP/InvertedStationPPFWeights.h"
 //#include "clAmdFft/include/clAmdFft.h"
 
+#if defined __linux__
+#include <sched.h>
+#include <sys/time.h>
+#endif
+
 namespace LOFAR {
 namespace RTCP {
 
@@ -41,9 +46,7 @@ unsigned nrGPUs;
 #define NR_TAPS			16
 #define NR_STATION_FILTER_TAPS	16
 
-// the SAP to process (we support only one SAP for now)
-#define SAP 0
-
+#undef USE_NEW_CORRELATOR
 #define USE_2X2
 #undef USE_CUSTOM_FFT
 #undef USE_TEST_DATA
@@ -81,6 +84,40 @@ double getTime()
   return now - firstTime;
 }
 
+#if defined __linux__
+
+inline void set_affinity(unsigned device)
+{
+#if 0
+  static const char mapping[1][12] = {
+     0,  1,  2,  3,  8,  9, 10, 11,
+  };
+#else
+  static const char mapping[8][12] = {
+     { 0,  1,  2,  3,  4,  5, 12, 13, 14, 15, 16, 17, },
+     { 0,  1,  2,  3,  4,  5, 12, 13, 14, 15, 16, 17, },
+     { 0,  1,  2,  3,  4,  5, 12, 13, 14, 15, 16, 17, },
+     { 0,  1,  2,  3,  4,  5, 12, 13, 14, 15, 16, 17, },
+     { 6,  7,  8,  9, 10, 11, 18, 19, 20, 21, 22, 23, },
+     { 6,  7,  8,  9, 10, 11, 18, 19, 20, 21, 22, 23, },
+     { 6,  7,  8,  9, 10, 11, 18, 19, 20, 21, 22, 23, },
+     { 6,  7,  8,  9, 10, 11, 18, 19, 20, 21, 22, 23, },
+  };
+#endif
+
+  cpu_set_t set;
+
+  CPU_ZERO(&set);
+
+  for (unsigned coreIndex = 0; coreIndex < 12; coreIndex ++)
+    CPU_SET(mapping[device][coreIndex], &set);
+
+  if (sched_setaffinity(0, sizeof set, &set) < 0)
+    perror("sched_setaffinity");
+}
+
+#endif
+
 
 class PerformanceCounter
 {
@@ -200,7 +237,7 @@ cl::Program createProgram(const Parset &ps, cl::Context &context, std::vector<cl
   args << " -DNR_SAMPLES_PER_CHANNEL=" << ps.nrSamplesPerChannel();
   args << " -DNR_SAMPLES_PER_SUBBAND=" << ps.nrSamplesPerSubband();
   args << " -DNR_BEAMS=" << ps.nrBeams();
-  args << " -DNR_TABS=" << ps.nrTABs(SAP);
+  args << " -DNR_TABS=" << ps.nrTABs(0);
   args << " -DNR_COHERENT_STOKES=" << ps.nrCoherentStokes();
   args << " -DNR_INCOHERENT_STOKES=" << ps.nrIncoherentStokes();
   args << " -DCOHERENT_STOKES_TIME_INTEGRATION_FACTOR=" << ps.coherentStokesTimeIntegrationFactor();
@@ -257,6 +294,10 @@ class Pipeline
     SmartPtr<InputSection<i16complex> > inputSection16;
     SmartPtr<InputSection<i8complex> >	inputSection8;
     SmartPtr<InputSection<i4complex> >	inputSection4;
+
+#if defined USE_B7015
+    OMP_Lock hostToDeviceLock[4], deviceToHostLock[4];
+#endif
 };
 
 
@@ -273,12 +314,12 @@ class CorrelatorPipeline : public Pipeline
     FilterBank		    filterBank;
 
     cl::Program		    firFilterProgram, delayAndBandPassProgram, correlatorProgram;
+#if defined USE_NEW_CORRELATOR
+    PerformanceCounter	    firFilterCounter, delayAndBandPassCounter, correlateTriangleCounter, correlateRectangleCounter, fftCounter;
+#else
     PerformanceCounter	    firFilterCounter, delayAndBandPassCounter, correlatorCounter, fftCounter;
-    PerformanceCounter	    samplesCounter, visibilitiesCounter;
-
-#if defined USE_B7015
-    OMP_Lock hostToDeviceLock[4], deviceToHostLock[4];
 #endif
+    PerformanceCounter	    samplesCounter, visibilitiesCounter;
 };
 
 
@@ -332,7 +373,12 @@ CorrelatorPipeline::CorrelatorPipeline(const Parset &ps)
   filterBank(true, NR_TAPS, ps.nrChannelsPerSubband(), KAISER),
   firFilterCounter("FIR filter"),
   delayAndBandPassCounter("delay/bp"),
+#if defined USE_NEW_CORRELATOR
+  correlateTriangleCounter("cor.triangle"),
+  correlateRectangleCounter("cor.rectangle"),
+#else
   correlatorCounter("correlator"),
+#endif
   fftCounter("FFT"),
   samplesCounter("samples"),
   visibilitiesCounter("visibilities")
@@ -348,8 +394,11 @@ CorrelatorPipeline::CorrelatorPipeline(const Parset &ps)
 #pragma omp section
     delayAndBandPassProgram = createProgram("DelayAndBandPass.cl");
 #pragma omp section
+#if defined USE_NEW_CORRELATOR
     correlatorProgram = createProgram("NewCorrelator.cl");
-    //correlatorProgram = createProgram("Correlator.cl");
+#else
+    correlatorProgram = createProgram("Correlator.cl");
+#endif
   }
 
   std::cout << "compile time = " << getTime() - startTime << std::endl;
@@ -666,7 +715,7 @@ class DelayAndBandPassKernel : public Kernel
 };
 
 
-#if 0
+#if !defined USE_NEW_CORRELATOR
 
 class CorrelatorKernel : public Kernel
 {
@@ -714,12 +763,13 @@ class CorrelatorKernel : public Kernel
       nrThreads = (nrThreads + preferredMultiple - 1) / preferredMultiple * preferredMultiple;
       //std::cout << "nrBlocks = " << nrBlocks << ", nrPasses = " << nrPasses << ", preferredMultiple = " << preferredMultiple << ", nrThreads = " << nrThreads << std::endl;
 
-      globalWorkSize = cl::NDRange(nrPasses * nrThreads, ps.nrChannelsPerSubband());
+      unsigned nrUsableChannels = std::max(ps.nrChannelsPerSubband() - 1, 1U);
+      globalWorkSize = cl::NDRange(nrPasses * nrThreads, nrUsableChannels);
       localWorkSize  = cl::NDRange(nrThreads, 1);
 
-      nrOperations   = (size_t) ps.nrChannelsPerSubband() * ps.nrBaselines() * ps.nrSamplesPerChannel() * 32;
-      nrBytesRead    = (size_t) nrPasses * ps.nrStations() * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>);
-      nrBytesWritten = (size_t) ps.nrBaselines() * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      nrOperations   = (size_t) nrUsableChannels * ps.nrBaselines() * ps.nrSamplesPerChannel() * 32;
+      nrBytesRead    = (size_t) nrPasses * ps.nrStations() * nrUsableChannels * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      nrBytesWritten = (size_t) ps.nrBaselines() * nrUsableChannels * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>);
     }
 };
 
@@ -731,7 +781,7 @@ class CorrelatorKernel : public Kernel
     CorrelatorKernel(const Parset &ps, cl::CommandQueue &queue, cl::Program &program, cl::Buffer &devVisibilities, cl::Buffer &devCorrectedData)
     :
 #if defined USE_2X2
-      Kernel(ps, program, "correlateRectangles")
+      Kernel(ps, program, "correlate")
 #else
 #error not implemented
 #endif
@@ -739,17 +789,90 @@ class CorrelatorKernel : public Kernel
       setArg(0, devVisibilities);
       setArg(1, devCorrectedData);
 
-      unsigned nrRectanglesPerSide = ((ps.nrStations() - 1) / (2 * 16));
+      unsigned nrRectanglesPerSide = (ps.nrStations() - 1) / (2 * 16);
+      unsigned nrRectangles = nrRectanglesPerSide * (nrRectanglesPerSide + 1) / 2;
+//#pragma omp critical (cout)
+      //std::cout << "nrRectangles = " << nrRectangles << std::endl;
+
+      unsigned nrBlocksPerSide = (ps.nrStations() + 2 * 16 - 1) / (2 * 16);
+      unsigned nrBlocks	       = nrBlocksPerSide * (nrBlocksPerSide + 1) / 2;
+//#pragma omp critical (cout)
+      //std::cout << "nrBlocks = " << nrBlocks << std::endl;
+
+      unsigned nrUsableChannels = std::max(ps.nrChannelsPerSubband() - 1, 1U);
+      globalWorkSize = cl::NDRange(16 * 16, nrBlocks, nrUsableChannels);
+      localWorkSize  = cl::NDRange(16 * 16, 1, 1);
+
+      // FIXME
+      //nrOperations   = (size_t) (32 * 32) * nrRectangles * nrUsableChannels * ps.nrSamplesPerChannel() * 32;
+      nrOperations   = (size_t) ps.nrBaselines() * ps.nrSamplesPerSubband() * 32;
+      nrBytesRead    = (size_t) (32 + 32) * nrRectangles * nrUsableChannels * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      nrBytesWritten = (size_t) (32 * 32) * nrRectangles * nrUsableChannels * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>);
+    }
+};
+
+
+class CorrelateRectangleKernel : public Kernel
+{
+  public:
+    CorrelateRectangleKernel(const Parset &ps, cl::CommandQueue &queue, cl::Program &program, cl::Buffer &devVisibilities, cl::Buffer &devCorrectedData)
+    :
+#if defined USE_2X2
+      Kernel(ps, program, "correlateRectangleKernel")
+#else
+#error not implemented
+#endif
+    {
+      setArg(0, devVisibilities);
+      setArg(1, devCorrectedData);
+
+      unsigned nrRectanglesPerSide = (ps.nrStations() - 1) / (2 * 16);
       unsigned nrRectangles = nrRectanglesPerSide * (nrRectanglesPerSide + 1) / 2;
 #pragma omp critical (cout)
       std::cout << "nrRectangles = " << nrRectangles << std::endl;
 
-      globalWorkSize = cl::NDRange(16 * 16, nrRectangles, ps.nrChannelsPerSubband());
+      unsigned nrUsableChannels = std::max(ps.nrChannelsPerSubband() - 1, 1U);
+      globalWorkSize = cl::NDRange(16 * 16, nrRectangles, nrUsableChannels);
       localWorkSize  = cl::NDRange(16 * 16, 1, 1);
 
-      nrOperations   = (size_t) (32 * 32) * nrRectangles * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * 32;
-      nrBytesRead    = (size_t) (32 + 32) * nrRectangles * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>);
-      nrBytesWritten = (size_t) (32 * 32) * nrRectangles * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      nrOperations   = (size_t) (32 * 32) * nrRectangles * nrUsableChannels * ps.nrSamplesPerChannel() * 32;
+      nrBytesRead    = (size_t) (32 + 32) * nrRectangles * nrUsableChannels * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      nrBytesWritten = (size_t) (32 * 32) * nrRectangles * nrUsableChannels * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>);
+    }
+};
+
+
+class CorrelateTriangleKernel : public Kernel
+{
+  public:
+    CorrelateTriangleKernel(const Parset &ps, cl::CommandQueue &queue, cl::Program &program, cl::Buffer &devVisibilities, cl::Buffer &devCorrectedData)
+    :
+#if defined USE_2X2
+      Kernel(ps, program, "correlateTriangleKernel")
+#else
+#error not implemented
+#endif
+    {
+      setArg(0, devVisibilities);
+      setArg(1, devCorrectedData);
+
+      unsigned nrTriangles = (ps.nrStations() + 2 * 16 - 1) / (2 * 16);
+      unsigned nrMiniBlocksPerSide = 16;
+      unsigned nrMiniBlocks = nrMiniBlocksPerSide * (nrMiniBlocksPerSide + 1) / 2;
+      size_t   preferredMultiple;
+      getWorkGroupInfo(queue.getInfo<CL_QUEUE_DEVICE>(), CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &preferredMultiple);
+      unsigned nrThreads = align(nrMiniBlocks, preferredMultiple);
+
+#pragma omp critical (cout)
+      std::cout << "nrTriangles = " << nrTriangles << ", nrMiniBlocks = " << nrMiniBlocks << ", nrThreads = " << nrThreads << std::endl;
+
+      unsigned nrUsableChannels = std::max(ps.nrChannelsPerSubband() - 1, 1U);
+      globalWorkSize = cl::NDRange(nrThreads, nrTriangles, nrUsableChannels);
+      localWorkSize  = cl::NDRange(nrThreads, 1, 1);
+
+      nrOperations   = (size_t) (32 * 32 / 2) * nrTriangles * nrUsableChannels * ps.nrSamplesPerChannel() * 32;
+      nrBytesRead    = (size_t) 32 * nrTriangles * nrUsableChannels * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      nrBytesWritten = (size_t) (32 * 32 / 2) * nrTriangles * nrUsableChannels * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>);
     }
 };
 
@@ -815,18 +938,18 @@ class BeamFormerKernel : public Kernel
       setArg(1, devCorrectedData);
       setArg(2, devBeamFormerWeights);
 
-      globalWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(SAP), ps.nrChannelsPerSubband());
-      localWorkSize  = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(SAP), 1);
+      globalWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(0), ps.nrChannelsPerSubband());
+      localWorkSize  = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(0), 1);
 
       // FIXME: nrTABs
-      //queue.enqueueNDRangeKernel(*this, cl::NullRange, cl::NDRange(16, ps.nrTABs(SAP), ps.nrChannelsPerSubband()), cl::NDRange(16, ps.nrTABs(SAP), 1), 0, &event);
+      //queue.enqueueNDRangeKernel(*this, cl::NullRange, cl::NDRange(16, ps.nrTABs(0), ps.nrChannelsPerSubband()), cl::NDRange(16, ps.nrTABs(0), 1), 0, &event);
 
       size_t count = ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * NR_POLARIZATIONS;
-      size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(SAP) * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(0) * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * sizeof(std::complex<float>);
       size_t nrSampleBytesPerPass = count * ps.nrStations() * sizeof(std::complex<float>);
-      size_t nrComplexVoltagesBytesPerPass = count * ps.nrTABs(SAP) * sizeof(std::complex<float>);
+      size_t nrComplexVoltagesBytesPerPass = count * ps.nrTABs(0) * sizeof(std::complex<float>);
       unsigned nrPasses = std::max((ps.nrStations() + 6) / 16, 1U);
-      nrOperations   = count * ps.nrStations() * ps.nrTABs(SAP) * 8;
+      nrOperations   = count * ps.nrStations() * ps.nrTABs(0) * 8;
       nrBytesRead    = nrWeightsBytes + nrSampleBytesPerPass + (nrPasses - 1) * nrComplexVoltagesBytesPerPass;
       nrBytesWritten = nrPasses * nrComplexVoltagesBytesPerPass;
     }
@@ -844,14 +967,14 @@ class BeamFormerTransposeKernel : public Kernel
       setArg(0, devTransposedData);
       setArg(1, devComplexVoltages);
 
-      //globalWorkSize = cl::NDRange(256, (ps.nrTABs(SAP) + 15) / 16, (ps.nrChannelsPerSubband() + 15) / 16);
-      globalWorkSize = cl::NDRange(256, (ps.nrTABs(SAP) + 15) / 16, ps.nrSamplesPerChannel() / 16);
+      //globalWorkSize = cl::NDRange(256, (ps.nrTABs(0) + 15) / 16, (ps.nrChannelsPerSubband() + 15) / 16);
+      globalWorkSize = cl::NDRange(256, (ps.nrTABs(0) + 15) / 16, ps.nrSamplesPerChannel() / 16);
       localWorkSize  = cl::NDRange(256, 1, 1);
 
       nrOperations   = 0;
-      nrBytesRead    = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>),
-      //nrBytesWritten = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * ps.nrChannelsPerSubband() * sizeof(std::complex<float>);
-      nrBytesWritten = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * sizeof(std::complex<float>);
+      nrBytesRead    = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>),
+      //nrBytesWritten = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * ps.nrChannelsPerSubband() * sizeof(std::complex<float>);
+      nrBytesWritten = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * sizeof(std::complex<float>);
     }
 };
 
@@ -871,7 +994,7 @@ class Dedispersion_FFT_Kernel
 
     void enqueue(cl::CommandQueue &queue, PerformanceCounter &counter, clFFT_Direction direction)
     {
-      size_t nrFFTs = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize();
+      size_t nrFFTs = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize();
 
       cl_int error = clFFT_ExecuteInterleaved(queue(), plan.plan, nrFFTs, direction, buffer(), buffer(), 0, 0, &event());
 
@@ -896,7 +1019,7 @@ class DedispersionForwardFFTkernel : public FFT_Kernel
   public:
     DedispersionForwardFFTkernel(const Parset &ps, cl::Context &context, cl::Buffer &buffer)
     :
-      FFT_Kernel(context, ps.dedispersionFFTsize(), ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(), true, buffer)
+      FFT_Kernel(context, ps.dedispersionFFTsize(), ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(), true, buffer)
     {
       ASSERT(ps.nrSamplesPerChannel() % ps.dedispersionFFTsize() == 0);
     }
@@ -908,7 +1031,7 @@ class DedispersionBackwardFFTkernel : public FFT_Kernel
   public:
     DedispersionBackwardFFTkernel(const Parset &ps, cl::Context &context, cl::Buffer &buffer)
     :
-      FFT_Kernel(context, ps.dedispersionFFTsize(), ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(), false, buffer)
+      FFT_Kernel(context, ps.dedispersionFFTsize(), ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(), false, buffer)
     {
       ASSERT(ps.nrSamplesPerChannel() % ps.dedispersionFFTsize() == 0);
     }
@@ -946,8 +1069,8 @@ class DedispersionChirpKernel : public Kernel
 	//std::cout << "localWorkSize = NDRange(" << fftSize / divisor << ", 1, 1))" << std::endl;
       }
 
-      nrOperations = (size_t) NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * (9 * ps.nrTABs(SAP) + 17),
-      nrBytesRead  = nrBytesWritten = sizeof(std::complex<float>) * ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel();
+      nrOperations = (size_t) NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * (9 * ps.nrTABs(0) + 17),
+      nrBytesRead  = nrBytesWritten = sizeof(std::complex<float>) * ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel();
     }
 
     void enqueue(cl::CommandQueue &queue, PerformanceCounter &counter, double subbandFrequency)
@@ -970,12 +1093,12 @@ class CoherentStokesKernel : public Kernel
       setArg(0, devStokesData);
       setArg(1, devComplexVoltages);
 
-      globalWorkSize = cl::NDRange(256, (ps.nrTABs(SAP) + 15) / 16, (ps.nrChannelsPerSubband() + 15) / 16);
+      globalWorkSize = cl::NDRange(256, (ps.nrTABs(0) + 15) / 16, (ps.nrChannelsPerSubband() + 15) / 16);
       localWorkSize  = cl::NDRange(256, 1, 1);
 
-      nrOperations   = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(SAP) * (ps.nrCoherentStokes() == 1 ? 8 : 20 + 2.0 / ps.coherentStokesTimeIntegrationFactor());
-      nrBytesRead    = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>);
-      nrBytesWritten = (size_t) ps.nrTABs(SAP) * ps.nrCoherentStokes() * ps.nrSamplesPerChannel() / ps.coherentStokesTimeIntegrationFactor() * ps.nrChannelsPerSubband() * sizeof(float);
+      nrOperations   = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(0) * (ps.nrCoherentStokes() == 1 ? 8 : 20 + 2.0 / ps.coherentStokesTimeIntegrationFactor());
+      nrBytesRead    = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      nrBytesWritten = (size_t) ps.nrTABs(0) * ps.nrCoherentStokes() * ps.nrSamplesPerChannel() / ps.coherentStokesTimeIntegrationFactor() * ps.nrChannelsPerSubband() * sizeof(float);
     }
 };
 
@@ -992,31 +1115,31 @@ class UHEP_BeamFormerKernel : public Kernel
       setArg(2, devBeamFormerWeights);
 
 #if 1
-      globalWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(SAP), ps.nrSubbands());
-      localWorkSize  = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(SAP), 1);
+      globalWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(0), ps.nrSubbands());
+      localWorkSize  = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(0), 1);
 
       size_t count = ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * NR_POLARIZATIONS;
-      size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(SAP) * ps.nrSubbands() * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(0) * ps.nrSubbands() * NR_POLARIZATIONS * sizeof(std::complex<float>);
       size_t nrSampleBytes = count * ps.nrStations() * ps.nrBytesPerComplexSample();
-      size_t nrComplexVoltagesBytesPerPass = count * ps.nrTABs(SAP) * sizeof(std::complex<float>);
+      size_t nrComplexVoltagesBytesPerPass = count * ps.nrTABs(0) * sizeof(std::complex<float>);
       unsigned nrPasses = std::max((ps.nrStations() + 6) / 16, 1U);
-      nrOperations   = count * ps.nrStations() * ps.nrTABs(SAP) * 8;
+      nrOperations   = count * ps.nrStations() * ps.nrTABs(0) * 8;
       nrBytesRead    = nrWeightsBytes + nrSampleBytes + (nrPasses - 1) * nrComplexVoltagesBytesPerPass;
       nrBytesWritten = nrPasses * nrComplexVoltagesBytesPerPass;
 #else
-      ASSERT(ps.nrTABs(SAP) % 3 == 0);
+      ASSERT(ps.nrTABs(0) % 3 == 0);
       ASSERT(ps.nrStations() % 6 == 0);
-      unsigned nrThreads = NR_POLARIZATIONS * (ps.nrTABs(SAP) / 3) * (ps.nrStations() / 6);
+      unsigned nrThreads = NR_POLARIZATIONS * (ps.nrTABs(0) / 3) * (ps.nrStations() / 6);
       globalWorkSize = cl::NDRange(nrThreads, ps.nrSubbands());
       localWorkSize  = cl::NDRange(nrThreads, 1);
-      //globalWorkSize = cl::NDRange(ps.nrStations() / 6, ps.nrTABs(SAP) / 3, ps.nrSubbands());
-      //localWorkSize  = cl::NDRange(ps.nrStations() / 6, ps.nrTABs(SAP) / 3, 1);
+      //globalWorkSize = cl::NDRange(ps.nrStations() / 6, ps.nrTABs(0) / 3, ps.nrSubbands());
+      //localWorkSize  = cl::NDRange(ps.nrStations() / 6, ps.nrTABs(0) / 3, 1);
 
       size_t count = ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * NR_POLARIZATIONS;
-      size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(SAP) * ps.nrSubbands() * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(0) * ps.nrSubbands() * NR_POLARIZATIONS * sizeof(std::complex<float>);
       size_t nrSampleBytes = count * ps.nrStations() * ps.nrBytesPerComplexSample();
-      size_t nrComplexVoltagesBytes = count * ps.nrTABs(SAP) * sizeof(std::complex<float>);
-      nrOperations   = count * ps.nrStations() * ps.nrTABs(SAP) * 8;
+      size_t nrComplexVoltagesBytes = count * ps.nrTABs(0) * sizeof(std::complex<float>);
+      nrOperations   = count * ps.nrStations() * ps.nrTABs(0) * 8;
       nrBytesRead    = nrWeightsBytes + nrSampleBytes;
       nrBytesWritten = nrComplexVoltagesBytes;
 #endif
@@ -1035,12 +1158,12 @@ class UHEP_TransposeKernel : public Kernel
       setArg(1, devComplexVoltages);
       setArg(2, devReverseSubbandMapping);
 
-      globalWorkSize = cl::NDRange(256, (ps.nrTABs(SAP) + 15) / 16, 512 / 16);
+      globalWorkSize = cl::NDRange(256, (ps.nrTABs(0) + 15) / 16, 512 / 16);
       localWorkSize  = cl::NDRange(256, 1, 1);
 
       nrOperations   = 0;
-      nrBytesRead    = (size_t) ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>);
-      nrBytesWritten = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * 512 * sizeof(std::complex<float>);
+      nrBytesRead    = (size_t) ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>);
+      nrBytesWritten = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * 512 * sizeof(std::complex<float>);
     }
 };
 
@@ -1055,10 +1178,10 @@ class UHEP_InvFFT_Kernel : public Kernel
       setArg(0, devFFTedData);
       setArg(1, devFFTedData);
 
-      globalWorkSize = cl::NDRange(128, ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrSamplesPerChannel());
+      globalWorkSize = cl::NDRange(128, ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrSamplesPerChannel());
       localWorkSize  = cl::NDRange(128, 1);
 
-      size_t nrFFTs = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1);
+      size_t nrFFTs = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1);
       nrOperations   = nrFFTs * 5 * 1024 * 10;
       nrBytesRead    = nrFFTs * 512 * sizeof(std::complex<float>);
       nrBytesWritten = nrFFTs * 1024 * sizeof(float);
@@ -1083,10 +1206,10 @@ class UHEP_InvFIR_Kernel : public Kernel
       for (nrThreads = 1024; nrThreads > maxNrThreads; nrThreads /= 2)
 	;
 
-      globalWorkSize = cl::NDRange(1024, NR_POLARIZATIONS, ps.nrTABs(SAP));
+      globalWorkSize = cl::NDRange(1024, NR_POLARIZATIONS, ps.nrTABs(0));
       localWorkSize  = cl::NDRange(nrThreads, 1, 1);
 
-      size_t count = ps.nrTABs(SAP) * NR_POLARIZATIONS * 1024;
+      size_t count = ps.nrTABs(0) * NR_POLARIZATIONS * 1024;
       nrOperations   = count * ps.nrSamplesPerChannel() * NR_STATION_FILTER_TAPS * 2;
       nrBytesRead    = count * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * sizeof(float);
       nrBytesWritten = count * ps.nrSamplesPerChannel() * sizeof(float);
@@ -1104,12 +1227,12 @@ class UHEP_TriggerKernel : public Kernel
       setArg(0, devTriggerInfo);
       setArg(1, devInvFIRfilteredData);
 
-      globalWorkSize = cl::NDRange(16, 16, ps.nrTABs(SAP));
+      globalWorkSize = cl::NDRange(16, 16, ps.nrTABs(0));
       localWorkSize  = cl::NDRange(16, 16, 1);
 
-      nrOperations   = (size_t) ps.nrTABs(SAP) * ps.nrSamplesPerChannel() * 1024 * (3 /* power */ + 2 /* window */ + 1 /* max */ + 7 /* mean/variance */);
-      nrBytesRead    = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * 1024 * sizeof(float);
-      nrBytesWritten = (size_t) ps.nrTABs(SAP) * sizeof(TriggerInfo);
+      nrOperations   = (size_t) ps.nrTABs(0) * ps.nrSamplesPerChannel() * 1024 * (3 /* power */ + 2 /* window */ + 1 /* max */ + 7 /* mean/variance */);
+      nrBytesRead    = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * 1024 * sizeof(float);
+      nrBytesWritten = (size_t) ps.nrTABs(0) * sizeof(TriggerInfo);
     }
 };
 
@@ -1143,6 +1266,8 @@ CorrelatorWorkQueue::CorrelatorWorkQueue(CorrelatorPipeline &pipeline)
   inputSamples(boost::extents[ps.nrStations()][(ps.nrSamplesPerChannel() + NR_TAPS - 1) * ps.nrChannelsPerSubband()][NR_POLARIZATIONS][ps.nrBytesPerComplexSample()], queue, CL_MEM_WRITE_ONLY, devBufferA),
  visibilities(boost::extents[ps.nrBaselines()][ps.nrChannelsPerSubband()][NR_POLARIZATIONS][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, devBufferB)
 {
+  memset(inputSamples.origin(), 0, inputSamples.bytesize()); // FIXME
+  memset(visibilities.origin(), 0, visibilities.bytesize()); // FIXME
   size_t firWeightsSize = ps.nrChannelsPerSubband() * NR_TAPS * sizeof(float);
   devFIRweights = cl::Buffer(pipeline.context, CL_MEM_READ_ONLY, firWeightsSize);
   queue.enqueueWriteBuffer(devFIRweights, CL_TRUE, 0, ps.nrChannelsPerSubband() * NR_TAPS * sizeof(float), pipeline.filterBank.getWeights().origin());
@@ -1168,7 +1293,12 @@ void CorrelatorWorkQueue::doWork()
   FIR_FilterKernel firFilterKernel(ps, queue, pipeline.firFilterProgram, devFilteredData, inputSamples, devFIRweights);
   Filter_FFT_Kernel fftKernel(ps, pipeline.context, devFilteredData);
   DelayAndBandPassKernel delayAndBandPassKernel(ps, pipeline.delayAndBandPassProgram, devCorrectedData, devFilteredData, delaysAtBegin, delaysAfterEnd, phaseOffsets, bandPassCorrectionWeights);
+#if defined USE_NEW_CORRELATOR
+  CorrelateTriangleKernel correlateTriangleKernel(ps, queue, pipeline.correlatorProgram, visibilities, devCorrectedData);
+  CorrelateRectangleKernel correlateRectangleKernel(ps, queue, pipeline.correlatorProgram, visibilities, devCorrectedData);
+#else
   CorrelatorKernel correlatorKernel(ps, queue, pipeline.correlatorProgram, visibilities, devCorrectedData);
+#endif
   double startTime = ps.startTime(), currentTime, stopTime = ps.stopTime(), blockTime = ps.CNintegrationTime();
 
 #pragma omp barrier
@@ -1176,7 +1306,7 @@ void CorrelatorWorkQueue::doWork()
   double executionStartTime = getTime();
 
   for (unsigned block = 0; (currentTime = startTime + block * blockTime) < stopTime; block ++) {
-#pragma omp single
+#pragma omp single nowait
 #pragma omp critical (cout)
     std::cout << "block = " << block << ", time = " << to_simple_string(from_ustime_t(currentTime)) << std::endl;
 
@@ -1185,17 +1315,14 @@ void CorrelatorWorkQueue::doWork()
     memset(phaseOffsets.origin(), 0, phaseOffsets.bytesize());
 
     // FIXME!!!
-    if (ps.nrStations() >= 3)
-      delaysAtBegin[0][2][0] = 1e-6, delaysAfterEnd[0][2][0] = 1.1e-6;
+    //if (ps.nrStations() >= 3)
+      //delaysAtBegin[0][2][0] = 1e-6, delaysAfterEnd[0][2][0] = 1.1e-6;
 
     delaysAtBegin.hostToDevice(CL_FALSE);
     delaysAfterEnd.hostToDevice(CL_FALSE);
     phaseOffsets.hostToDevice(CL_FALSE);
-    queue.finish();
 
-#pragma omp barrier
-
-#pragma omp for schedule(dynamic)
+#pragma omp for schedule(dynamic), nowait
     for (unsigned subband = 0; subband < ps.nrSubbands(); subband ++) {
       try {
 #if defined USE_TEST_DATA
@@ -1217,7 +1344,12 @@ void CorrelatorWorkQueue::doWork()
 	}
 
 	delayAndBandPassKernel.enqueue(queue, pipeline.delayAndBandPassCounter, subband);
+#if defined USE_NEW_CORRELATOR
+	correlateTriangleKernel.enqueue(queue, pipeline.correlateTriangleCounter);
+	correlateRectangleKernel.enqueue(queue, pipeline.correlateRectangleCounter);
+#else
 	correlatorKernel.enqueue(queue, pipeline.correlatorCounter);
+#endif
 	queue.finish();
 
 	{
@@ -1276,11 +1408,11 @@ BeamFormerWorkQueue::BeamFormerWorkQueue(BeamFormerPipeline &pipeline)
   delaysAfterEnd(boost::extents[ps.nrBeams()][ps.nrStations()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY),
   phaseOffsets(boost::extents[ps.nrBeams()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY),
   devCorrectedData(cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, ps.nrStations() * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>))),
-  beamFormerWeights(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrTABs(SAP)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY),
-  devComplexVoltages(cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>))),
-  //transposedComplexVoltages(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE)
-  transposedComplexVoltages(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE),
-  DMs(boost::extents[ps.nrTABs(SAP)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY)
+  beamFormerWeights(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrTABs(0)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY),
+  devComplexVoltages(cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>))),
+  //transposedComplexVoltages(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE)
+  transposedComplexVoltages(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE),
+  DMs(boost::extents[ps.nrTABs(0)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY)
 {
   if (ps.correctBandPass()) {
     BandPass::computeCorrectionFactors(bandPassCorrectionWeights.origin(), ps.nrChannelsPerSubband());
@@ -1310,7 +1442,7 @@ void BeamFormerWorkQueue::doWork()
   double executionStartTime = getTime();
 
   for (unsigned block = 0; (currentTime = startTime + block * blockTime) < stopTime; block ++) {
-#pragma omp single
+#pragma omp single nowait
 #pragma omp critical (cout)
     std::cout << "block = " << block << ", time = " << to_simple_string(from_ustime_t(currentTime)) << std::endl;
 
@@ -1326,11 +1458,8 @@ void BeamFormerWorkQueue::doWork()
     delaysAfterEnd.hostToDevice(CL_FALSE);
     phaseOffsets.hostToDevice(CL_FALSE);
     beamFormerWeights.hostToDevice(CL_FALSE);
-    queue.finish();
-
-#pragma omp barrier
 
-#pragma omp for schedule(dynamic)
+#pragma omp for schedule(dynamic), nowait
     for (unsigned subband = 0; subband < ps.nrSubbands(); subband ++) {
       try {
 #if 1
@@ -1384,13 +1513,13 @@ UHEP_WorkQueue::UHEP_WorkQueue(UHEP_Pipeline &pipeline)
   WorkQueue(pipeline),
   pipeline(pipeline),
   hostInputSamples(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][NR_POLARIZATIONS][ps.nrBytesPerComplexSample()], queue, CL_MEM_WRITE_ONLY),
-  hostBeamFormerWeights(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrTABs(SAP)], queue, CL_MEM_WRITE_ONLY),
-  hostTriggerInfo(ps.nrTABs(SAP), queue, CL_MEM_READ_ONLY)
+  hostBeamFormerWeights(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrTABs(0)], queue, CL_MEM_WRITE_ONLY),
+  hostTriggerInfo(ps.nrTABs(0), queue, CL_MEM_READ_ONLY)
 {
   size_t inputSamplesSize = ps.nrStations() * ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * NR_POLARIZATIONS * ps.nrBytesPerComplexSample();
-  size_t complexVoltagesSize = ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>);
-  size_t transposedDataSize = ps.nrTABs(SAP) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * 512 * sizeof(std::complex<float>);
-  size_t invFIRfilteredDataSize = ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * 512 * sizeof(std::complex<float>);
+  size_t complexVoltagesSize = ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>);
+  size_t transposedDataSize = ps.nrTABs(0) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * 512 * sizeof(std::complex<float>);
+  size_t invFIRfilteredDataSize = ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * 512 * sizeof(std::complex<float>);
 
   size_t buffer0size = std::max(inputSamplesSize, transposedDataSize);
   size_t buffer1size = std::max(complexVoltagesSize, invFIRfilteredDataSize);
@@ -1398,7 +1527,7 @@ UHEP_WorkQueue::UHEP_WorkQueue(UHEP_Pipeline &pipeline)
   devBuffers[0] = cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, buffer0size);
   devBuffers[1] = cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, buffer1size);
 
-  size_t beamFormerWeightsSize = ps.nrStations() * ps.nrSubbands() * ps.nrTABs(SAP) * sizeof(std::complex<float>);
+  size_t beamFormerWeightsSize = ps.nrStations() * ps.nrSubbands() * ps.nrTABs(0) * sizeof(std::complex<float>);
   devBeamFormerWeights = cl::Buffer(pipeline.context, CL_MEM_READ_ONLY, beamFormerWeightsSize);
 
   devInputSamples = devBuffers[0];
@@ -1409,7 +1538,7 @@ UHEP_WorkQueue::UHEP_WorkQueue(UHEP_Pipeline &pipeline)
   devFFTedData = devBuffers[0];
   devInvFIRfilteredData = devBuffers[1];
 
-  devTriggerInfo = cl::Buffer(pipeline.context, CL_MEM_WRITE_ONLY, ps.nrTABs(SAP) * sizeof(TriggerInfo));
+  devTriggerInfo = cl::Buffer(pipeline.context, CL_MEM_WRITE_ONLY, ps.nrTABs(0) * sizeof(TriggerInfo));
 }
 
 
@@ -1430,12 +1559,12 @@ void UHEP_WorkQueue::doWork(const float * /*delaysAtBegin*/, const float * /*del
 
   double executionStartTime = getTime();
 
-#pragma omp for schedule(dynamic)
+#pragma omp for schedule(dynamic), nowait
   for (unsigned block = 0; block < nrBlocks; block ++) {
     try {
       double currentTime = startTime + block * blockTime;
 
-//#pragma omp single // FIXME: why does the compiler complain here???
+//#pragma omp single nowait // FIXME: why does the compiler complain here???
 #pragma omp critical (cout)
       std::cout << "block = " << block << ", time = " << to_simple_string(from_ustime_t(currentTime)) << std::endl;
 
@@ -1607,18 +1736,21 @@ struct CorrelatorTest : public UnitTest
 {
   CorrelatorTest(const Parset &ps)
   :
-    //UnitTest(ps, "Correlator.cl")
+#if defined USE_NEW_CORRELATOR
     UnitTest(ps, "NewCorrelator.cl")
+#else
+    UnitTest(ps, "Correlator.cl")
+#endif
   {
     if (ps.nrStations() >= 5 && ps.nrChannelsPerSubband() >= 6 && ps.nrSamplesPerChannel() >= 100) {
-      MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
       MultiArraySharedBuffer<std::complex<float>, 4> visibilities(boost::extents[ps.nrBaselines()][ps.nrChannelsPerSubband()][NR_POLARIZATIONS][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
       CorrelatorKernel correlator(ps, queue, program, visibilities, inputData);
 
       //inputData[3][5][99][1] = std::complex<float>(3, 4);
       //inputData[4][5][99][1] = std::complex<float>(5, 6);
-      inputData[2][5][99][1] = std::complex<float>(3, 4);
-      inputData[65][5][99][1] = std::complex<float>(5, 6);
+      inputData[0][5][99][1] = std::complex<float>(3, 4);
+      inputData[2][5][99][1] = std::complex<float>(5, 6);
 
 visibilities.hostToDevice(CL_FALSE);
       inputData.hostToDevice(CL_FALSE);
@@ -1635,6 +1767,70 @@ visibilities.hostToDevice(CL_FALSE);
 };
 
 
+#if defined USE_NEW_CORRELATOR
+
+struct CorrelateRectangleTest : public UnitTest
+{
+  CorrelateRectangleTest(const Parset &ps)
+  :
+    //UnitTest(ps, "Correlator.cl")
+    UnitTest(ps, "NewCorrelator.cl")
+  {
+    if (ps.nrStations() >= 5 && ps.nrChannelsPerSubband() >= 6 && ps.nrSamplesPerChannel() >= 100) {
+      MultiArraySharedBuffer<std::complex<float>, 4> visibilities(boost::extents[ps.nrBaselines()][ps.nrChannelsPerSubband()][NR_POLARIZATIONS][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
+      CorrelateRectangleKernel correlator(ps, queue, program, visibilities, inputData);
+
+      inputData[27][5][99][1] = std::complex<float>(3, 4);
+      inputData[68][5][99][1] = std::complex<float>(5, 6);
+
+visibilities.hostToDevice(CL_FALSE);
+      inputData.hostToDevice(CL_FALSE);
+      correlator.enqueue(queue, counter);
+      visibilities.deviceToHost(CL_TRUE);
+
+      //check(visibilities[5463][5][1][1], std::complex<float>(39, 2));
+      for (unsigned bl = 0; bl < ps.nrBaselines(); bl ++)
+	if (visibilities[bl][5][1][1] != std::complex<float>(0, 0))
+	  std::cout << "bl = " << bl << ", visibility = " << visibilities[bl][5][1][1] << std::endl;
+    }
+  }
+};
+
+
+struct CorrelateTriangleTest : public UnitTest
+{
+  CorrelateTriangleTest(const Parset &ps)
+  :
+    //UnitTest(ps, "Correlator.cl")
+    UnitTest(ps, "NewCorrelator.cl")
+  {
+    if (ps.nrStations() >= 5 && ps.nrChannelsPerSubband() >= 6 && ps.nrSamplesPerChannel() >= 100) {
+      MultiArraySharedBuffer<std::complex<float>, 4> visibilities(boost::extents[ps.nrBaselines()][ps.nrChannelsPerSubband()][NR_POLARIZATIONS][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
+      CorrelateTriangleKernel correlator(ps, queue, program, visibilities, inputData);
+
+      //inputData[3][5][99][1] = std::complex<float>(3, 4);
+      //inputData[4][5][99][1] = std::complex<float>(5, 6);
+      inputData[0][5][99][1] = std::complex<float>(3, 4);
+      inputData[2][5][99][1] = std::complex<float>(5, 6);
+
+visibilities.hostToDevice(CL_FALSE);
+      inputData.hostToDevice(CL_FALSE);
+      correlator.enqueue(queue, counter);
+      visibilities.deviceToHost(CL_TRUE);
+
+      //check(visibilities[13][5][1][1], std::complex<float>(39, 2));
+      for (unsigned bl = 0; bl < ps.nrBaselines(); bl ++)
+	if (visibilities[bl][5][1][1] != std::complex<float>(0, 0))
+	  std::cout << "bl = " << bl << ", visibility = " << visibilities[bl][5][1][1] << std::endl;
+    }
+  }
+};
+
+#endif
+
+
 struct IncoherentStokesTest : public UnitTest
 {
   IncoherentStokesTest(const Parset &ps)
@@ -1699,10 +1895,10 @@ struct BeamFormerTest : public UnitTest
   :
     UnitTest(ps, "BeamFormer/BeamFormer.cl")
   {
-    if (ps.nrStations() >= 5 && ps.nrSamplesPerChannel() >= 13 && ps.nrChannelsPerSubband() >= 7 && ps.nrTABs(SAP) >= 6) {
+    if (ps.nrStations() >= 5 && ps.nrSamplesPerChannel() >= 13 && ps.nrChannelsPerSubband() >= 7 && ps.nrTABs(0) >= 6) {
       MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
-      MultiArraySharedBuffer<std::complex<float>, 3> beamFormerWeights(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrTABs(SAP)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
-      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE);
+      MultiArraySharedBuffer<std::complex<float>, 3> beamFormerWeights(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrTABs(0)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE);
       BeamFormerKernel beamFormer(ps, program, complexVoltages, inputData, beamFormerWeights);
 
       inputData[4][6][12][1] = std::complex<float>(2.2, 3);
@@ -1716,7 +1912,7 @@ struct BeamFormerTest : public UnitTest
       check(complexVoltages[6][12][5][1], std::complex<float>(-6.2, 23));
 
 #if 0
-      for (unsigned tab = 0; tab < ps.nrTABs(SAP); tab ++)
+      for (unsigned tab = 0; tab < ps.nrTABs(0); tab ++)
 	for (unsigned pol = 0; pol < NR_POLARIZATIONS; pol ++)
 	  for (unsigned ch = 0; ch < ps.nrChannelsPerSubband(); ch ++)
 	    for (unsigned t = 0; t < ps.nrSamplesPerChannel(); t ++)
@@ -1734,9 +1930,9 @@ struct BeamFormerTransposeTest : public UnitTest
   :
     UnitTest(ps, "BeamFormer/Transpose.cl")
   {
-    if (ps.nrChannelsPerSubband() >= 19 && ps.nrSamplesPerChannel() >= 175 && ps.nrTABs(SAP) >= 5) {
-      MultiArraySharedBuffer<std::complex<float>, 4> transposedData(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
-      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY);
+    if (ps.nrChannelsPerSubband() >= 19 && ps.nrSamplesPerChannel() >= 175 && ps.nrTABs(0) >= 5) {
+      MultiArraySharedBuffer<std::complex<float>, 4> transposedData(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY);
       BeamFormerTransposeKernel transpose(ps, program, transposedData, complexVoltages);
 
       complexVoltages[18][174][4][1] = std::complex<float>(24, 42);
@@ -1757,9 +1953,9 @@ struct DedispersionChirpTest : public UnitTest
   :
     UnitTest(ps, "BeamFormer/Dedispersion.cl")
   {
-    if (ps.nrTABs(SAP) > 3 && ps.nrChannelsPerSubband() > 13 && ps.nrSamplesPerChannel() / ps.dedispersionFFTsize() > 1 && ps.dedispersionFFTsize() > 77) {
-      MultiArraySharedBuffer<std::complex<float>, 5> data(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel() / ps.dedispersionFFTsize()][ps.dedispersionFFTsize()], queue, CL_MEM_READ_WRITE, CL_MEM_READ_WRITE);
-      MultiArraySharedBuffer<float, 1> DMs(boost::extents[ps.nrTABs(SAP)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
+    if (ps.nrTABs(0) > 3 && ps.nrChannelsPerSubband() > 13 && ps.nrSamplesPerChannel() / ps.dedispersionFFTsize() > 1 && ps.dedispersionFFTsize() > 77) {
+      MultiArraySharedBuffer<std::complex<float>, 5> data(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel() / ps.dedispersionFFTsize()][ps.dedispersionFFTsize()], queue, CL_MEM_READ_WRITE, CL_MEM_READ_WRITE);
+      MultiArraySharedBuffer<float, 1> DMs(boost::extents[ps.nrTABs(0)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
       DedispersionChirpKernel dedispersionChirpKernel(ps, program, queue, data, DMs);
 
       data[3][1][13][1][77] = std::complex<float>(2, 3);
@@ -1782,16 +1978,16 @@ struct CoherentStokesTest : public UnitTest
   :
     UnitTest(ps, "BeamFormer/CoherentStokes.cl")
   {
-    if (ps.nrChannelsPerSubband() >= 19 && ps.nrSamplesPerChannel() >= 175 && ps.nrTABs(SAP) >= 5) {
-      MultiArraySharedBuffer<float, 4> stokesData(boost::extents[ps.nrTABs(SAP)][ps.nrCoherentStokes()][ps.nrSamplesPerChannel() / ps.coherentStokesTimeIntegrationFactor()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
+    if (ps.nrChannelsPerSubband() >= 19 && ps.nrSamplesPerChannel() >= 175 && ps.nrTABs(0) >= 5) {
+      MultiArraySharedBuffer<float, 4> stokesData(boost::extents[ps.nrTABs(0)][ps.nrCoherentStokes()][ps.nrSamplesPerChannel() / ps.coherentStokesTimeIntegrationFactor()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
 #if 1
-      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY);
       CoherentStokesKernel stokesKernel(ps, program, stokesData, complexVoltages);
 
       complexVoltages[18][174][4][0] = std::complex<float>(2, 3);
       complexVoltages[18][174][4][1] = std::complex<float>(4, 5);
 #else
-      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY);
       CoherentStokesKernel stokesKernel(ps, program, stokesData, complexVoltages);
 
       complexVoltages[18][174][4][0] = std::complex<float>(2, 3);
@@ -1815,10 +2011,10 @@ struct UHEP_BeamFormerTest : public UnitTest
   :
     UnitTest(ps, "UHEP/BeamFormer.cl")
   {
-    if (ps.nrStations() >= 5 && (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) >= 13 && ps.nrSubbands() >= 7 && ps.nrTABs(SAP) >= 6) {
+    if (ps.nrStations() >= 5 && (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) >= 13 && ps.nrSubbands() >= 7 && ps.nrTABs(0) >= 6) {
       MultiArraySharedBuffer<char, 5> inputSamples(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][NR_POLARIZATIONS][ps.nrBytesPerComplexSample()], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
-      MultiArraySharedBuffer<std::complex<float>, 3> beamFormerWeights(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrTABs(SAP)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
-      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE);
+      MultiArraySharedBuffer<std::complex<float>, 3> beamFormerWeights(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrTABs(0)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE);
       UHEP_BeamFormerKernel beamFormer(ps, program, complexVoltages, inputSamples, beamFormerWeights);
 
       switch (ps.nrBytesPerComplexSample()) {
@@ -1851,9 +2047,9 @@ struct UHEP_TransposeTest : public UnitTest
   :
     UnitTest(ps, "UHEP/Transpose.cl")
   {
-    if (ps.nrSubbands() >= 19 && ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1 >= 175 && ps.nrTABs(SAP) >= 5) {
-      MultiArraySharedBuffer<std::complex<float>, 4> transposedData(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][512], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
-      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY);
+    if (ps.nrSubbands() >= 19 && ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1 >= 175 && ps.nrTABs(0) >= 5) {
+      MultiArraySharedBuffer<std::complex<float>, 4> transposedData(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][512], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
+      MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY);
       cl::Buffer devReverseSubbandMapping(context, CL_MEM_READ_ONLY, 512 * sizeof(int));
       UHEP_TransposeKernel transpose(ps, program, transposedData, complexVoltages, devReverseSubbandMapping);
 
@@ -1876,9 +2072,9 @@ struct UHEP_TriggerTest : public UnitTest
   :
     UnitTest(ps, "UHEP/Trigger.cl")
   {
-    if (ps.nrTABs(SAP) >= 4 && 1024 * ps.nrSamplesPerChannel() > 100015) {
-      MultiArraySharedBuffer<float, 3> inputData(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel() * 1024], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
-      MultiArraySharedBuffer<TriggerInfo, 1> triggerInfo(boost::extents[ps.nrTABs(SAP)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
+    if (ps.nrTABs(0) >= 4 && 1024 * ps.nrSamplesPerChannel() > 100015) {
+      MultiArraySharedBuffer<float, 3> inputData(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel() * 1024], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
+      MultiArraySharedBuffer<TriggerInfo, 1> triggerInfo(boost::extents[ps.nrTABs(0)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
       UHEP_TriggerKernel trigger(ps, program, triggerInfo, inputData);
 
       inputData[3][1][100015] = 1000;
@@ -1900,7 +2096,7 @@ struct UHEP_TriggerTest : public UnitTest
 struct FFT_Test : public UnitTest
 {
   FFT_Test(const Parset &ps)
-  : UnitTest(ps, "fft2.cl")
+  : UnitTest(ps, "fft.cl")
   {
     MultiArraySharedBuffer<std::complex<float>, 1> in(boost::extents[8], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY);
     MultiArraySharedBuffer<std::complex<float>, 1> out(boost::extents[8], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY);
@@ -1979,34 +2175,14 @@ int main(int argc, char **argv)
     const char *str = getenv("NR_GPUS");
     nrGPUs = str ? atoi(str) : 1;
 
-#if 0
-    ps.nrSubbands()		= 10;//488;
-    ps.nrChannelsPerSubband()	= 64;
-    ps.nrBeams()		= 1;
-    ps.nrSamplesPerChannel()	= 196608 / ps.nrChannelsPerSubband();
-    ps.subbandBandwidth()	= 195312.5;
-    ps.correctBandPass()	= true;
-#endif
+    profiling = false; CorrelatorPipeline(ps).doWork();
+    profiling = true; CorrelatorPipeline(ps).doWork();
 
-    //profiling = false; CorrelatorPipeline(ps).doWork();
-    //profiling = true; CorrelatorPipeline(ps).doWork();
-
-    (CorrelatorTest)(ps);
+    //(CorrelatorTest)(ps);
+    //(CorrelateRectangleTest)(ps);
+    //(CorrelateTriangleTest)(ps);
 
 #if 0
-    ps.nrSubbands()		  = 488;
-    ps.nrChannelsPerSubband()	  = 2048;
-    ps.nrBeams()		  = 1;
-    ps.nrTABs(SAP)			  = 128;
-    ps.nrIncoherentStokes()	  = 4;
-    ps.nrCoherentStokes()	  = 4;
-    ps.incoherentStokesTimeIntegrationFactor() = 8;
-    ps.coherentStokesTimeIntegrationFactor() = 8;
-    ps.nrSamplesPerChannel()	  = 65536 / ps.nrChannelsPerSubband();//262144 / ps.nrChannelsPerSubband();
-    ps.subbandBandwidth()	  = 195312.5;
-    ps.correctBandPass()	  = true;
-    ps.dedispersionFFTsize()	  = ps.nrSamplesPerChannel();
-  
     profiling = false; BeamFormerPipeline(ps).doWork();
     profiling = true; BeamFormerPipeline(ps).doWork();
     //(IncoherentStokesTest)(ps);
@@ -2018,12 +2194,6 @@ int main(int argc, char **argv)
 #endif
 
 #if 0
-    ps.nrSubbands()	   = 488;
-    ps.nrSamplesPerChannel() = 1024;
-    ps.nrBeams()	   = 1;
-    ps.subbandBandwidth()  = 195312.5;
-    ps.nrTABs(SAP)		   = 48;
-
     profiling = false; UHEP_Pipeline(ps).doWork();
     profiling = true; UHEP_Pipeline(ps).doWork();
     //(UHEP_BeamFormerTest)(ps);
diff --git a/RTCP/GPUProc/src/UHEP/BeamFormer.cl-0.ptx b/RTCP/GPUProc/src/UHEP/BeamFormer.cl-0.ptx
deleted file mode 100644
index ea220810bb76a4961820e5b0583dc8ffe90325aa..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/UHEP/BeamFormer.cl-0.ptx
+++ /dev/null
@@ -1,207 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Tue Feb  7 07:15:59 2012 (1328595359)
-// Driver 295.20
-//
-
-.version 3.0
-.target sm_21, texmode_independent
-.address_size 32
-
-.extern .shared .align 16 .b8 shr_2__local[2048];
-
-.entry complexVoltages(
-	.param .u32 .ptr .global .align 1 complexVoltages_param_0,
-	.param .u32 .ptr .global .align 1 complexVoltages_param_1,
-	.param .u32 .ptr .global .align 1 complexVoltages_param_2
-)
-{
-	.reg .f32 	%f<173>;
-	.reg .pred 	%p<7>;
-	.reg .s32 	%r<76>;
-	.reg .s16 	%rc<16>;
-
-
-	ld.param.u32 	%r29, [complexVoltages_param_2];
-	// inline asm
-	mov.u32 	%r22, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r23, %tid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r24, %envreg5;
-	// inline asm
-	// inline asm
-	mov.u32 	%r25, %ntid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r26, %ctaid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r27, %tid.z;
-	// inline asm
-	add.s32 	%r30, %r27, %r24;
-	mad.lo.s32 	%r4, %r26, %r25, %r30;
-	shl.b32 	%r31, %r23, 4;
-	mad.lo.s32 	%r32, %r4, 66496, %r31;
-	shl.b32 	%r33, %r22, 3;
-	add.s32 	%r5, %r32, %r33;
-	shl.b32 	%r34, %r4, 5;
-	shl.b32 	%r35, %r23, 3;
-	add.s32 	%r36, %r34, %r35;
-	add.s32 	%r37, %r29, %r36;
-	ld.global.v2.f32 	{%f131, %f132}, [%r37];
-	ld.global.v2.f32 	{%f137, %f138}, [%r37+15616];
-	ld.global.v2.f32 	{%f143, %f144}, [%r37+31232];
-	ld.global.v2.f32 	{%f149, %f150}, [%r37+46848];
-	ld.global.v2.f32 	{%f155, %f156}, [%r37+62464];
-	ld.global.v2.f32 	{%f161, %f162}, [%r37+78080];
-	ld.global.v2.f32 	{%f167, %f168}, [%r37+93696];
-	mov.u32 	%r72, 0;
-
-BB0_1:
-	shl.b32 	%r40, %r72, 10;
-	add.s32 	%r7, %r5, %r40;
-	// inline asm
-	mov.u32 	%r38, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r39, %tid.y;
-	// inline asm
-	shl.b32 	%r9, %r39, 1;
-	add.s32 	%r74, %r9, %r38;
-	setp.gt.u32 	%p1, %r74, 127;
-	@%p1 bra 	BB0_6;
-
-	add.s32 	%r11, %r38, %r9;
-	cvt.u8.u32 	%rc1, %r11;
-	mov.u16 	%rc15, 0;
-	mov.u32 	%r73, 0;
-
-BB0_3:
-	add.s32 	%r14, %r11, %r73;
-	shl.b32 	%r42, %r72, 4;
-	shl.b16 	%rc5, %rc15, 3;
-	add.s16 	%rc6, %rc1, %rc5;
-	cvt.u32.u8 	%r43, %rc6;
-	and.b32  	%r44, %r43, 15;
-	add.s32 	%r15, %r42, %r44;
-	setp.gt.u32 	%p2, %r15, 1038;
-	@%p2 bra 	BB0_5;
-
-	shr.u32 	%r45, %r14, 4;
-	ld.param.u32 	%r71, [complexVoltages_param_1];
-	mad.lo.s32 	%r46, %r45, 2028128, %r71;
-	mad.lo.s32 	%r47, %r4, 4156, %r46;
-	shl.b32 	%r48, %r15, 2;
-	add.s32 	%r49, %r47, %r48;
-	ld.global.v4.u8 	{%rc11, %rc12, %rc13, %rc14}, [%r49];
-	// inline asm
-	cvt.rn.f32.s8 	%f1, %rc11;
-	// inline asm
-	// inline asm
-	cvt.rn.f32.s8 	%f2, %rc12;
-	// inline asm
-	// inline asm
-	cvt.rn.f32.s8 	%f3, %rc13;
-	// inline asm
-	// inline asm
-	cvt.rn.f32.s8 	%f4, %rc14;
-	// inline asm
-	shl.b32 	%r50, %r74, 4;
-	mov.u32 	%r51, shr_2__local;
-	add.s32 	%r52, %r51, %r50;
-	st.shared.v4.f32 	[%r52], {%f1, %f2, %f3, %f4};
-
-BB0_5:
-	add.s32 	%r74, %r74, 8;
-	setp.lt.u32 	%p3, %r74, 128;
-	add.s32 	%r73, %r73, 8;
-	add.s16 	%rc15, %rc15, 1;
-	@%p3 bra 	BB0_3;
-
-BB0_6:
-	bar.sync 	0;
-	mov.u32 	%r56, 1039;
-	shl.b32 	%r57, %r72, 4;
-	sub.s32 	%r18, %r56, %r57;
-	mov.u32 	%r54, 16;
-	// inline asm
-	min.u32 	%r53, %r54, %r18;
-	// inline asm
-	setp.eq.s32 	%p4, %r53, 0;
-	@%p4 bra 	BB0_9;
-
-	mov.u32 	%r75, 0;
-
-BB0_8:
-	shl.b32 	%r62, %r75, 6;
-	add.s32 	%r63, %r7, %r62;
-	ld.param.u32 	%r70, [complexVoltages_param_0];
-	add.s32 	%r64, %r70, %r63;
-	shl.b32 	%r65, %r75, 4;
-	mov.u32 	%r66, shr_2__local;
-	add.s32 	%r67, %r66, %r65;
-	add.s32 	%r69, %r67, %r33;
-	ld.shared.v2.f32 	{%f27, %f28}, [%r69];
-	mov.f32 	%f5, 0f00000000;
-	fma.rn.ftz.f32 	%f31, %f131, %f27, %f5;
-	fma.rn.ftz.f32 	%f32, %f131, %f28, %f5;
-	neg.ftz.f32 	%f7, %f28;
-	fma.rn.ftz.f32 	%f39, %f132, %f7, %f31;
-	fma.rn.ftz.f32 	%f40, %f132, %f27, %f32;
-	ld.shared.v2.f32 	{%f43, %f44}, [%r69+256];
-	fma.rn.ftz.f32 	%f45, %f137, %f43, %f39;
-	fma.rn.ftz.f32 	%f46, %f137, %f44, %f40;
-	neg.ftz.f32 	%f10, %f44;
-	fma.rn.ftz.f32 	%f53, %f138, %f10, %f45;
-	fma.rn.ftz.f32 	%f54, %f138, %f43, %f46;
-	ld.shared.v2.f32 	{%f57, %f58}, [%r69+512];
-	fma.rn.ftz.f32 	%f59, %f143, %f57, %f53;
-	fma.rn.ftz.f32 	%f60, %f143, %f58, %f54;
-	neg.ftz.f32 	%f13, %f58;
-	fma.rn.ftz.f32 	%f67, %f144, %f13, %f59;
-	fma.rn.ftz.f32 	%f68, %f144, %f57, %f60;
-	ld.shared.v2.f32 	{%f71, %f72}, [%r69+768];
-	fma.rn.ftz.f32 	%f73, %f149, %f71, %f67;
-	fma.rn.ftz.f32 	%f74, %f149, %f72, %f68;
-	neg.ftz.f32 	%f16, %f72;
-	fma.rn.ftz.f32 	%f81, %f150, %f16, %f73;
-	fma.rn.ftz.f32 	%f82, %f150, %f71, %f74;
-	ld.shared.v2.f32 	{%f85, %f86}, [%r69+1024];
-	fma.rn.ftz.f32 	%f87, %f155, %f85, %f81;
-	fma.rn.ftz.f32 	%f88, %f155, %f86, %f82;
-	neg.ftz.f32 	%f19, %f86;
-	fma.rn.ftz.f32 	%f95, %f156, %f19, %f87;
-	fma.rn.ftz.f32 	%f96, %f156, %f85, %f88;
-	ld.shared.v2.f32 	{%f99, %f100}, [%r69+1280];
-	fma.rn.ftz.f32 	%f101, %f161, %f99, %f95;
-	fma.rn.ftz.f32 	%f102, %f161, %f100, %f96;
-	neg.ftz.f32 	%f22, %f100;
-	fma.rn.ftz.f32 	%f109, %f162, %f22, %f101;
-	fma.rn.ftz.f32 	%f110, %f162, %f99, %f102;
-	ld.shared.v2.f32 	{%f113, %f114}, [%r69+1536];
-	fma.rn.ftz.f32 	%f115, %f167, %f113, %f109;
-	fma.rn.ftz.f32 	%f116, %f167, %f114, %f110;
-	neg.ftz.f32 	%f25, %f114;
-	fma.rn.ftz.f32 	%f123, %f168, %f25, %f115;
-	fma.rn.ftz.f32 	%f124, %f168, %f113, %f116;
-	st.global.v2.f32 	[%r64], {%f123, %f124};
-	// inline asm
-	min.u32 	%r59, %r54, %r18;
-	// inline asm
-	add.s32 	%r75, %r75, 1;
-	setp.lt.u32 	%p5, %r75, %r59;
-	@%p5 bra 	BB0_8;
-
-BB0_9:
-	bar.sync 	0;
-	add.s32 	%r72, %r72, 1;
-	setp.ne.s32 	%p6, %r72, 65;
-	@%p6 bra 	BB0_1;
-
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop-0.ptx b/RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop-0.ptx
deleted file mode 100644
index 79498463a835875a85828f52fd7f756eb2b37aab..0000000000000000000000000000000000000000
Binary files a/RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop-0.ptx and /dev/null differ
diff --git a/RTCP/GPUProc/src/UHEP/InvFFT.cl-0.ptx b/RTCP/GPUProc/src/UHEP/InvFFT.cl-0.ptx
deleted file mode 100644
index 86d5c2b091806e681f5dfe3df13f2ce5d1d68535..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/UHEP/InvFFT.cl-0.ptx
+++ /dev/null
@@ -1,759 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Tue Feb  7 07:15:59 2012 (1328595359)
-// Driver 295.20
-//
-
-.version 3.0
-.target sm_21, texmode_independent
-.address_size 32
-
-.extern .shared .align 32 .b8 shr_3_lds[4096];
-
-.entry inv_fft(
-	.param .u32 .ptr .global .align 8 inv_fft_param_0,
-	.param .u32 .ptr .global .align 4 inv_fft_param_1
-)
-.reqntid 128, 1, 1
-{
-	.reg .f32 	%f<576>;
-	.reg .pred 	%p<2>;
-	.reg .s32 	%r<212>;
-
-
-	ld.param.u32 	%r26, [inv_fft_param_1];
-	// inline asm
-	mov.u32 	%r2, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r3, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r4, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r5, %ctaid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r6, %tid.y;
-	// inline asm
-	add.s32 	%r27, %r6, %r3;
-	mad.lo.s32 	%r28, %r5, %r4, %r27;
-	shl.b32 	%r29, %r28, 12;
-	add.s32 	%r30, %r26, %r29;
-	shl.b32 	%r31, %r2, 3;
-	add.s32 	%r32, %r30, %r31;
-	ld.global.v2.f32 	{%f574, %f575}, [%r32];
-	// inline asm
-	mov.u32 	%r7, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r8, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r9, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r10, %ctaid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r11, %tid.y;
-	// inline asm
-	add.s32 	%r33, %r11, %r8;
-	mad.lo.s32 	%r34, %r10, %r9, %r33;
-	shl.b32 	%r35, %r34, 12;
-	add.s32 	%r36, %r26, %r35;
-	shl.b32 	%r37, %r7, 3;
-	add.s32 	%r38, %r37, %r36;
-	ld.global.v2.f32 	{%f516, %f517}, [%r38+1024];
-	// inline asm
-	mov.u32 	%r12, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r13, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r14, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r15, %ctaid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r16, %tid.y;
-	// inline asm
-	add.s32 	%r39, %r16, %r13;
-	mad.lo.s32 	%r40, %r15, %r14, %r39;
-	shl.b32 	%r41, %r40, 12;
-	add.s32 	%r42, %r26, %r41;
-	shl.b32 	%r43, %r12, 3;
-	add.s32 	%r44, %r43, %r42;
-	ld.global.v2.f32 	{%f510, %f511}, [%r44+2048];
-	// inline asm
-	mov.u32 	%r17, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r18, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r19, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r20, %ctaid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r21, %tid.y;
-	// inline asm
-	add.s32 	%r45, %r21, %r18;
-	mad.lo.s32 	%r46, %r20, %r19, %r45;
-	shl.b32 	%r47, %r46, 12;
-	add.s32 	%r48, %r26, %r47;
-	shl.b32 	%r49, %r17, 3;
-	add.s32 	%r50, %r49, %r48;
-	ld.global.v2.f32 	{%f522, %f523}, [%r50+3072];
-	// inline asm
-	mov.u32 	%r22, %tid.x;
-	// inline asm
-	shl.b32 	%r51, %r22, 3;
-	mov.u32 	%r52, shr_3_lds;
-	add.s32 	%r53, %r52, %r51;
-	st.shared.v2.f32 	[%r53], {%f574, %f575};
-	// inline asm
-	mov.u32 	%r23, %tid.x;
-	// inline asm
-	shl.b32 	%r54, %r23, 3;
-	add.s32 	%r55, %r54, %r52;
-	st.shared.v2.f32 	[%r55+1024], {%f516, %f517};
-	// inline asm
-	mov.u32 	%r24, %tid.x;
-	// inline asm
-	shl.b32 	%r56, %r24, 3;
-	add.s32 	%r57, %r56, %r52;
-	st.shared.v2.f32 	[%r57+2048], {%f510, %f511};
-	// inline asm
-	mov.u32 	%r25, %tid.x;
-	// inline asm
-	shl.b32 	%r58, %r25, 3;
-	add.s32 	%r59, %r58, %r52;
-	st.shared.v2.f32 	[%r59+3072], {%f522, %f523};
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r60, %tid.x;
-	// inline asm
-	setp.eq.s32 	%p1, %r60, 0;
-	@%p1 bra 	BB0_2;
-
-	// inline asm
-	mov.u32 	%r61, %tid.x;
-	// inline asm
-	mov.u32 	%r62, 512;
-	sub.s32 	%r63, %r62, %r61;
-	shl.b32 	%r64, %r63, 3;
-	add.s32 	%r66, %r52, %r64;
-	ld.shared.v2.f32 	{%f572, %f573}, [%r66];
-	bra.uni 	BB0_3;
-
-BB0_2:
-	mov.f32 	%f17, 0f00000000;
-	mov.f32 	%f572, %f575;
-	mov.f32 	%f573, %f17;
-	mov.f32 	%f574, %f574;
-	mov.f32 	%f575, %f17;
-
-BB0_3:
-	// inline asm
-	mov.u32 	%r67, %tid.x;
-	// inline asm
-	mov.u32 	%r70, 384;
-	sub.s32 	%r71, %r70, %r67;
-	shl.b32 	%r72, %r71, 3;
-	add.s32 	%r74, %r52, %r72;
-	ld.shared.v2.f32 	{%f490, %f491}, [%r74];
-	// inline asm
-	mov.u32 	%r68, %tid.x;
-	// inline asm
-	mov.u32 	%r75, 256;
-	sub.s32 	%r76, %r75, %r68;
-	shl.b32 	%r77, %r76, 3;
-	add.s32 	%r78, %r52, %r77;
-	ld.shared.v2.f32 	{%f492, %f493}, [%r78];
-	// inline asm
-	mov.u32 	%r69, %tid.x;
-	// inline asm
-	mov.u32 	%r79, 128;
-	sub.s32 	%r80, %r79, %r69;
-	shl.b32 	%r81, %r80, 3;
-	add.s32 	%r82, %r52, %r81;
-	neg.ftz.f32 	%f20, %f573;
-	neg.ftz.f32 	%f22, %f491;
-	neg.ftz.f32 	%f24, %f493;
-	ld.shared.v2.f32 	{%f500, %f501}, [%r82];
-	neg.ftz.f32 	%f26, %f501;
-	add.ftz.f32 	%f504, %f574, %f572;
-	add.ftz.f32 	%f505, %f575, %f20;
-	sub.ftz.f32 	%f506, %f574, %f572;
-	sub.ftz.f32 	%f507, %f575, %f20;
-	add.ftz.f32 	%f508, %f510, %f492;
-	add.ftz.f32 	%f509, %f511, %f24;
-	sub.ftz.f32 	%f512, %f510, %f492;
-	sub.ftz.f32 	%f513, %f511, %f24;
-	add.ftz.f32 	%f514, %f516, %f490;
-	add.ftz.f32 	%f515, %f517, %f22;
-	sub.ftz.f32 	%f518, %f516, %f490;
-	sub.ftz.f32 	%f519, %f517, %f22;
-	add.ftz.f32 	%f520, %f522, %f500;
-	add.ftz.f32 	%f521, %f523, %f26;
-	sub.ftz.f32 	%f524, %f522, %f500;
-	sub.ftz.f32 	%f525, %f523, %f26;
-	add.ftz.f32 	%f526, %f504, %f508;
-	add.ftz.f32 	%f527, %f505, %f509;
-	sub.ftz.f32 	%f528, %f504, %f508;
-	sub.ftz.f32 	%f529, %f505, %f509;
-	neg.ftz.f32 	%f28, %f513;
-	add.ftz.f32 	%f534, %f506, %f28;
-	add.ftz.f32 	%f535, %f507, %f512;
-	sub.ftz.f32 	%f536, %f506, %f28;
-	sub.ftz.f32 	%f537, %f507, %f512;
-	add.ftz.f32 	%f538, %f514, %f520;
-	add.ftz.f32 	%f539, %f515, %f521;
-	sub.ftz.f32 	%f540, %f514, %f520;
-	sub.ftz.f32 	%f541, %f515, %f521;
-	neg.ftz.f32 	%f31, %f525;
-	add.ftz.f32 	%f546, %f518, %f31;
-	add.ftz.f32 	%f547, %f519, %f524;
-	sub.ftz.f32 	%f548, %f518, %f31;
-	sub.ftz.f32 	%f549, %f519, %f524;
-	add.ftz.f32 	%f446, %f526, %f538;
-	add.ftz.f32 	%f447, %f527, %f539;
-	sub.ftz.f32 	%f454, %f526, %f538;
-	sub.ftz.f32 	%f455, %f527, %f539;
-	neg.ftz.f32 	%f34, %f547;
-	add.ftz.f32 	%f554, %f34, %f546;
-	add.ftz.f32 	%f555, %f546, %f547;
-	mov.f32 	%f36, 0f3F3504F3;
-	fma.rn.ftz.f32 	%f448, %f554, %f36, %f534;
-	fma.rn.ftz.f32 	%f449, %f555, %f36, %f535;
-	neg.f32 	%f558, %f554;
-	neg.f32 	%f559, %f555;
-	fma.rn.ftz.f32 	%f456, %f558, %f36, %f534;
-	fma.rn.ftz.f32 	%f457, %f559, %f36, %f535;
-	neg.ftz.f32 	%f38, %f541;
-	add.ftz.f32 	%f450, %f528, %f38;
-	add.ftz.f32 	%f451, %f529, %f540;
-	sub.ftz.f32 	%f458, %f528, %f38;
-	sub.ftz.f32 	%f459, %f529, %f540;
-	neg.ftz.f32 	%f41, %f549;
-	sub.ftz.f32 	%f568, %f41, %f548;
-	sub.ftz.f32 	%f569, %f548, %f549;
-	fma.rn.ftz.f32 	%f452, %f568, %f36, %f536;
-	fma.rn.ftz.f32 	%f453, %f569, %f36, %f537;
-	neg.f32 	%f570, %f568;
-	neg.f32 	%f571, %f569;
-	fma.rn.ftz.f32 	%f460, %f570, %f36, %f536;
-	fma.rn.ftz.f32 	%f461, %f571, %f36, %f537;
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r83, %tid.x;
-	// inline asm
-	shl.b32 	%r84, %r83, 5;
-	add.s32 	%r86, %r52, %r84;
-	st.shared.v4.f32 	[%r86+16], {%f454, %f456, %f458, %f460};
-	st.shared.v4.f32 	[%r86], {%f446, %f448, %f450, %f452};
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r87, %tid.x;
-	// inline asm
-	shl.b32 	%r95, %r87, 2;
-	add.s32 	%r97, %r52, %r95;
-	ld.shared.f32 	%f51, [%r97];
-	// inline asm
-	mov.u32 	%r88, %tid.x;
-	// inline asm
-	shl.b32 	%r98, %r88, 2;
-	add.s32 	%r99, %r98, %r52;
-	ld.shared.f32 	%f1, [%r99+512];
-	// inline asm
-	mov.u32 	%r89, %tid.x;
-	// inline asm
-	shl.b32 	%r100, %r89, 2;
-	add.s32 	%r101, %r100, %r52;
-	ld.shared.f32 	%f2, [%r101+1024];
-	// inline asm
-	mov.u32 	%r90, %tid.x;
-	// inline asm
-	shl.b32 	%r102, %r90, 2;
-	add.s32 	%r103, %r102, %r52;
-	ld.shared.f32 	%f3, [%r103+1536];
-	// inline asm
-	mov.u32 	%r91, %tid.x;
-	// inline asm
-	shl.b32 	%r104, %r91, 2;
-	add.s32 	%r105, %r104, %r52;
-	ld.shared.f32 	%f4, [%r105+2048];
-	// inline asm
-	mov.u32 	%r92, %tid.x;
-	// inline asm
-	shl.b32 	%r106, %r92, 2;
-	add.s32 	%r107, %r106, %r52;
-	ld.shared.f32 	%f5, [%r107+2560];
-	// inline asm
-	mov.u32 	%r93, %tid.x;
-	// inline asm
-	shl.b32 	%r108, %r93, 2;
-	add.s32 	%r109, %r108, %r52;
-	ld.shared.f32 	%f6, [%r109+3072];
-	// inline asm
-	mov.u32 	%r94, %tid.x;
-	// inline asm
-	shl.b32 	%r110, %r94, 2;
-	add.s32 	%r111, %r110, %r52;
-	ld.shared.f32 	%f7, [%r111+3584];
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r112, %tid.x;
-	// inline asm
-	shl.b32 	%r113, %r112, 5;
-	add.s32 	%r115, %r52, %r113;
-	st.shared.v4.f32 	[%r115+16], {%f455, %f457, %f459, %f461};
-	st.shared.v4.f32 	[%r115], {%f447, %f449, %f451, %f453};
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r116, %tid.x;
-	// inline asm
-	shl.b32 	%r124, %r116, 2;
-	add.s32 	%r126, %r52, %r124;
-	ld.shared.f32 	%f61, [%r126];
-	// inline asm
-	mov.u32 	%r117, %tid.x;
-	// inline asm
-	shl.b32 	%r127, %r117, 2;
-	add.s32 	%r128, %r127, %r52;
-	ld.shared.f32 	%f8, [%r128+512];
-	// inline asm
-	mov.u32 	%r118, %tid.x;
-	// inline asm
-	shl.b32 	%r129, %r118, 2;
-	add.s32 	%r130, %r129, %r52;
-	ld.shared.f32 	%f9, [%r130+1024];
-	// inline asm
-	mov.u32 	%r119, %tid.x;
-	// inline asm
-	shl.b32 	%r131, %r119, 2;
-	add.s32 	%r132, %r131, %r52;
-	ld.shared.f32 	%f10, [%r132+1536];
-	// inline asm
-	mov.u32 	%r120, %tid.x;
-	// inline asm
-	shl.b32 	%r133, %r120, 2;
-	add.s32 	%r134, %r133, %r52;
-	ld.shared.f32 	%f11, [%r134+2048];
-	// inline asm
-	mov.u32 	%r121, %tid.x;
-	// inline asm
-	shl.b32 	%r135, %r121, 2;
-	add.s32 	%r136, %r135, %r52;
-	ld.shared.f32 	%f12, [%r136+2560];
-	// inline asm
-	mov.u32 	%r122, %tid.x;
-	// inline asm
-	shl.b32 	%r137, %r122, 2;
-	add.s32 	%r138, %r137, %r52;
-	ld.shared.f32 	%f13, [%r138+3072];
-	// inline asm
-	mov.u32 	%r123, %tid.x;
-	// inline asm
-	shl.b32 	%r139, %r123, 2;
-	add.s32 	%r140, %r139, %r52;
-	ld.shared.f32 	%f14, [%r140+3584];
-	bar.sync 	0;
-	mov.f32 	%f90, 0f40000000;
-	mul.ftz.f32 	%f15, %f90, 0f40490FDB;
-	div.rn.ftz.f32 	%f91, %f15, 0f42800000;
-	// inline asm
-	mov.u32 	%r141, %tid.x;
-	// inline asm
-	and.b32  	%r144, %r141, 7;
-	cvt.rn.f32.u32 	%f92, %r144;
-	mul.ftz.f32 	%f65, %f91, %f92;
-	// inline asm
-	cos.approx.f32 	%f62, %f65;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f64, %f65;
-	// inline asm
-	mul.ftz.f32 	%f93, %f62, %f1;
-	neg.f32 	%f94, %f64;
-	fma.rn.ftz.f32 	%f95, %f94, %f8, %f93;
-	mul.ftz.f32 	%f96, %f64, %f1;
-	fma.rn.ftz.f32 	%f97, %f62, %f8, %f96;
-	fma.rn.ftz.f32 	%f67, %f91, %f92, %f65;
-	// inline asm
-	cos.approx.f32 	%f66, %f67;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f68, %f67;
-	// inline asm
-	mul.ftz.f32 	%f98, %f66, %f2;
-	neg.f32 	%f99, %f68;
-	fma.rn.ftz.f32 	%f100, %f99, %f9, %f98;
-	mul.ftz.f32 	%f101, %f68, %f2;
-	fma.rn.ftz.f32 	%f102, %f66, %f9, %f101;
-	mul.ftz.f32 	%f73, %f65, 0f40400000;
-	// inline asm
-	cos.approx.f32 	%f70, %f73;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f72, %f73;
-	// inline asm
-	mul.ftz.f32 	%f103, %f70, %f3;
-	neg.f32 	%f104, %f72;
-	fma.rn.ftz.f32 	%f105, %f104, %f10, %f103;
-	mul.ftz.f32 	%f106, %f72, %f3;
-	fma.rn.ftz.f32 	%f107, %f70, %f10, %f106;
-	mul.ftz.f32 	%f77, %f65, 0f40800000;
-	// inline asm
-	cos.approx.f32 	%f74, %f77;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f76, %f77;
-	// inline asm
-	mul.ftz.f32 	%f108, %f74, %f4;
-	neg.f32 	%f109, %f76;
-	fma.rn.ftz.f32 	%f110, %f109, %f11, %f108;
-	mul.ftz.f32 	%f111, %f76, %f4;
-	fma.rn.ftz.f32 	%f112, %f74, %f11, %f111;
-	mul.ftz.f32 	%f81, %f65, 0f40A00000;
-	// inline asm
-	cos.approx.f32 	%f78, %f81;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f80, %f81;
-	// inline asm
-	mul.ftz.f32 	%f113, %f78, %f5;
-	neg.f32 	%f114, %f80;
-	fma.rn.ftz.f32 	%f115, %f114, %f12, %f113;
-	mul.ftz.f32 	%f116, %f80, %f5;
-	fma.rn.ftz.f32 	%f117, %f78, %f12, %f116;
-	mul.ftz.f32 	%f85, %f65, 0f40C00000;
-	// inline asm
-	cos.approx.f32 	%f82, %f85;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f84, %f85;
-	// inline asm
-	mul.ftz.f32 	%f118, %f82, %f6;
-	neg.f32 	%f119, %f84;
-	fma.rn.ftz.f32 	%f120, %f119, %f13, %f118;
-	mul.ftz.f32 	%f121, %f84, %f6;
-	fma.rn.ftz.f32 	%f122, %f82, %f13, %f121;
-	mul.ftz.f32 	%f89, %f65, 0f40E00000;
-	// inline asm
-	cos.approx.f32 	%f86, %f89;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f88, %f89;
-	// inline asm
-	mul.ftz.f32 	%f123, %f86, %f7;
-	neg.f32 	%f124, %f88;
-	fma.rn.ftz.f32 	%f125, %f124, %f14, %f123;
-	mul.ftz.f32 	%f126, %f88, %f7;
-	fma.rn.ftz.f32 	%f127, %f86, %f14, %f126;
-	add.ftz.f32 	%f380, %f51, %f110;
-	add.ftz.f32 	%f381, %f61, %f112;
-	sub.ftz.f32 	%f384, %f51, %f110;
-	sub.ftz.f32 	%f385, %f61, %f112;
-	add.ftz.f32 	%f386, %f100, %f120;
-	add.ftz.f32 	%f387, %f102, %f122;
-	sub.ftz.f32 	%f388, %f100, %f120;
-	sub.ftz.f32 	%f389, %f102, %f122;
-	add.ftz.f32 	%f390, %f95, %f115;
-	add.ftz.f32 	%f391, %f97, %f117;
-	sub.ftz.f32 	%f392, %f95, %f115;
-	sub.ftz.f32 	%f393, %f97, %f117;
-	add.ftz.f32 	%f394, %f105, %f125;
-	add.ftz.f32 	%f395, %f107, %f127;
-	sub.ftz.f32 	%f396, %f105, %f125;
-	sub.ftz.f32 	%f397, %f107, %f127;
-	add.ftz.f32 	%f398, %f380, %f386;
-	add.ftz.f32 	%f399, %f381, %f387;
-	sub.ftz.f32 	%f400, %f380, %f386;
-	sub.ftz.f32 	%f401, %f381, %f387;
-	neg.ftz.f32 	%f129, %f389;
-	add.ftz.f32 	%f406, %f384, %f129;
-	add.ftz.f32 	%f407, %f385, %f388;
-	sub.ftz.f32 	%f408, %f384, %f129;
-	sub.ftz.f32 	%f409, %f385, %f388;
-	add.ftz.f32 	%f410, %f390, %f394;
-	add.ftz.f32 	%f411, %f391, %f395;
-	sub.ftz.f32 	%f412, %f390, %f394;
-	sub.ftz.f32 	%f413, %f391, %f395;
-	neg.ftz.f32 	%f132, %f397;
-	add.ftz.f32 	%f418, %f392, %f132;
-	add.ftz.f32 	%f419, %f393, %f396;
-	sub.ftz.f32 	%f420, %f392, %f132;
-	sub.ftz.f32 	%f421, %f393, %f396;
-	add.ftz.f32 	%f348, %f398, %f410;
-	add.ftz.f32 	%f349, %f399, %f411;
-	sub.ftz.f32 	%f356, %f398, %f410;
-	sub.ftz.f32 	%f357, %f399, %f411;
-	neg.ftz.f32 	%f135, %f419;
-	add.ftz.f32 	%f426, %f135, %f418;
-	add.ftz.f32 	%f427, %f418, %f419;
-	fma.rn.ftz.f32 	%f350, %f426, %f36, %f406;
-	fma.rn.ftz.f32 	%f351, %f427, %f36, %f407;
-	neg.f32 	%f430, %f426;
-	neg.f32 	%f431, %f427;
-	fma.rn.ftz.f32 	%f358, %f430, %f36, %f406;
-	fma.rn.ftz.f32 	%f359, %f431, %f36, %f407;
-	neg.ftz.f32 	%f139, %f413;
-	add.ftz.f32 	%f352, %f400, %f139;
-	add.ftz.f32 	%f353, %f401, %f412;
-	sub.ftz.f32 	%f360, %f400, %f139;
-	sub.ftz.f32 	%f361, %f401, %f412;
-	neg.ftz.f32 	%f142, %f421;
-	sub.ftz.f32 	%f440, %f142, %f420;
-	sub.ftz.f32 	%f441, %f420, %f421;
-	fma.rn.ftz.f32 	%f354, %f440, %f36, %f408;
-	fma.rn.ftz.f32 	%f355, %f441, %f36, %f409;
-	neg.f32 	%f442, %f440;
-	neg.f32 	%f443, %f441;
-	fma.rn.ftz.f32 	%f362, %f442, %f36, %f408;
-	fma.rn.ftz.f32 	%f363, %f443, %f36, %f409;
-	// inline asm
-	mov.u32 	%r142, %tid.x;
-	// inline asm
-	shl.b32 	%r145, %r142, 3;
-	// inline asm
-	mov.u32 	%r143, %tid.x;
-	// inline asm
-	and.b32  	%r146, %r143, 7;
-	and.b32  	%r147, %r145, 1073741760;
-	add.s32 	%r148, %r147, %r146;
-	shl.b32 	%r149, %r148, 2;
-	add.s32 	%r1, %r52, %r149;
-	st.shared.f32 	[%r1], %f348;
-	st.shared.f32 	[%r1+32], %f350;
-	st.shared.f32 	[%r1+64], %f352;
-	st.shared.f32 	[%r1+96], %f354;
-	st.shared.f32 	[%r1+128], %f356;
-	st.shared.f32 	[%r1+160], %f358;
-	st.shared.f32 	[%r1+192], %f360;
-	st.shared.f32 	[%r1+224], %f362;
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r151, %tid.x;
-	// inline asm
-	shl.b32 	%r155, %r151, 3;
-	add.s32 	%r157, %r52, %r155;
-	ld.shared.v2.f32 	{%f344, %f345}, [%r157];
-	// inline asm
-	mov.u32 	%r152, %tid.x;
-	// inline asm
-	shl.b32 	%r158, %r152, 3;
-	add.s32 	%r159, %r158, %r52;
-	ld.shared.v2.f32 	{%f282, %f283}, [%r159+1024];
-	// inline asm
-	mov.u32 	%r153, %tid.x;
-	// inline asm
-	shl.b32 	%r160, %r153, 3;
-	add.s32 	%r161, %r160, %r52;
-	ld.shared.v2.f32 	{%f288, %f289}, [%r161+2048];
-	// inline asm
-	mov.u32 	%r154, %tid.x;
-	// inline asm
-	shl.b32 	%r162, %r154, 3;
-	add.s32 	%r163, %r162, %r52;
-	ld.shared.v2.f32 	{%f294, %f295}, [%r163+3072];
-	bar.sync 	0;
-	st.shared.f32 	[%r1], %f349;
-	st.shared.f32 	[%r1+32], %f351;
-	st.shared.f32 	[%r1+64], %f353;
-	st.shared.f32 	[%r1+96], %f355;
-	st.shared.f32 	[%r1+128], %f357;
-	st.shared.f32 	[%r1+160], %f359;
-	st.shared.f32 	[%r1+192], %f361;
-	st.shared.f32 	[%r1+224], %f363;
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r164, %tid.x;
-	// inline asm
-	shl.b32 	%r168, %r164, 3;
-	add.s32 	%r170, %r52, %r168;
-	ld.shared.v2.f32 	{%f342, %f343}, [%r170];
-	// inline asm
-	mov.u32 	%r165, %tid.x;
-	// inline asm
-	shl.b32 	%r171, %r165, 3;
-	add.s32 	%r172, %r171, %r52;
-	ld.shared.v2.f32 	{%f284, %f285}, [%r172+1024];
-	// inline asm
-	mov.u32 	%r166, %tid.x;
-	// inline asm
-	shl.b32 	%r173, %r166, 3;
-	add.s32 	%r174, %r173, %r52;
-	ld.shared.v2.f32 	{%f290, %f291}, [%r174+2048];
-	// inline asm
-	mov.u32 	%r167, %tid.x;
-	// inline asm
-	shl.b32 	%r175, %r167, 3;
-	add.s32 	%r176, %r175, %r52;
-	ld.shared.v2.f32 	{%f296, %f297}, [%r176+3072];
-	bar.sync 	0;
-	div.rn.ftz.f32 	%f184, %f15, 0f43800000;
-	// inline asm
-	mov.u32 	%r177, %tid.x;
-	// inline asm
-	shl.b32 	%r180, %r177, 1;
-	and.b32  	%r181, %r180, 62;
-	cvt.rn.f32.u32 	%f185, %r181;
-	mul.ftz.f32 	%f163, %f184, %f185;
-	// inline asm
-	cos.approx.f32 	%f160, %f163;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f162, %f163;
-	// inline asm
-	mul.ftz.f32 	%f187, %f160, %f282;
-	neg.f32 	%f189, %f162;
-	fma.rn.ftz.f32 	%f190, %f189, %f284, %f187;
-	mul.ftz.f32 	%f191, %f162, %f282;
-	fma.rn.ftz.f32 	%f192, %f160, %f284, %f191;
-	fma.rn.ftz.f32 	%f165, %f184, %f185, %f163;
-	// inline asm
-	cos.approx.f32 	%f164, %f165;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f166, %f165;
-	// inline asm
-	mul.ftz.f32 	%f194, %f164, %f288;
-	neg.f32 	%f196, %f166;
-	fma.rn.ftz.f32 	%f197, %f196, %f290, %f194;
-	mul.ftz.f32 	%f198, %f166, %f288;
-	fma.rn.ftz.f32 	%f199, %f164, %f290, %f198;
-	mul.ftz.f32 	%f171, %f163, 0f40400000;
-	// inline asm
-	cos.approx.f32 	%f168, %f171;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f170, %f171;
-	// inline asm
-	mul.ftz.f32 	%f201, %f168, %f294;
-	neg.f32 	%f203, %f170;
-	fma.rn.ftz.f32 	%f204, %f203, %f296, %f201;
-	mul.ftz.f32 	%f205, %f170, %f294;
-	fma.rn.ftz.f32 	%f206, %f168, %f296, %f205;
-	fma.rn.ftz.f32 	%f175, %f184, %f185, %f184;
-	// inline asm
-	cos.approx.f32 	%f172, %f175;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f174, %f175;
-	// inline asm
-	mul.ftz.f32 	%f208, %f172, %f283;
-	neg.f32 	%f210, %f174;
-	fma.rn.ftz.f32 	%f211, %f210, %f285, %f208;
-	mul.ftz.f32 	%f212, %f174, %f283;
-	fma.rn.ftz.f32 	%f213, %f172, %f285, %f212;
-	add.ftz.f32 	%f177, %f175, %f175;
-	// inline asm
-	cos.approx.f32 	%f176, %f177;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f178, %f177;
-	// inline asm
-	mul.ftz.f32 	%f215, %f176, %f289;
-	neg.f32 	%f217, %f178;
-	fma.rn.ftz.f32 	%f218, %f217, %f291, %f215;
-	mul.ftz.f32 	%f219, %f178, %f289;
-	fma.rn.ftz.f32 	%f220, %f176, %f291, %f219;
-	mul.ftz.f32 	%f183, %f175, 0f40400000;
-	// inline asm
-	cos.approx.f32 	%f180, %f183;
-	// inline asm
-	// inline asm
-	sin.approx.f32 	%f182, %f183;
-	// inline asm
-	mul.ftz.f32 	%f222, %f180, %f295;
-	neg.f32 	%f224, %f182;
-	fma.rn.ftz.f32 	%f225, %f224, %f297, %f222;
-	mul.ftz.f32 	%f226, %f182, %f295;
-	fma.rn.ftz.f32 	%f227, %f180, %f297, %f226;
-	add.ftz.f32 	%f306, %f344, %f197;
-	add.ftz.f32 	%f307, %f342, %f199;
-	sub.ftz.f32 	%f310, %f344, %f197;
-	sub.ftz.f32 	%f311, %f342, %f199;
-	add.ftz.f32 	%f312, %f190, %f204;
-	add.ftz.f32 	%f313, %f192, %f206;
-	sub.ftz.f32 	%f314, %f190, %f204;
-	sub.ftz.f32 	%f315, %f192, %f206;
-	add.ftz.f32 	%f260, %f306, %f312;
-	add.ftz.f32 	%f261, %f307, %f313;
-	sub.ftz.f32 	%f272, %f306, %f312;
-	sub.ftz.f32 	%f273, %f307, %f313;
-	neg.ftz.f32 	%f229, %f315;
-	add.ftz.f32 	%f266, %f310, %f229;
-	add.ftz.f32 	%f267, %f311, %f314;
-	sub.ftz.f32 	%f278, %f310, %f229;
-	sub.ftz.f32 	%f279, %f311, %f314;
-	add.ftz.f32 	%f320, %f345, %f218;
-	add.ftz.f32 	%f321, %f343, %f220;
-	sub.ftz.f32 	%f324, %f345, %f218;
-	sub.ftz.f32 	%f325, %f343, %f220;
-	add.ftz.f32 	%f326, %f211, %f225;
-	add.ftz.f32 	%f327, %f213, %f227;
-	sub.ftz.f32 	%f328, %f211, %f225;
-	sub.ftz.f32 	%f329, %f213, %f227;
-	add.ftz.f32 	%f262, %f320, %f326;
-	add.ftz.f32 	%f263, %f321, %f327;
-	sub.ftz.f32 	%f274, %f320, %f326;
-	sub.ftz.f32 	%f275, %f321, %f327;
-	neg.ftz.f32 	%f232, %f329;
-	add.ftz.f32 	%f268, %f324, %f232;
-	add.ftz.f32 	%f269, %f325, %f328;
-	sub.ftz.f32 	%f280, %f324, %f232;
-	sub.ftz.f32 	%f281, %f325, %f328;
-	// inline asm
-	mov.u32 	%r178, %tid.x;
-	// inline asm
-	shl.b32 	%r182, %r178, 2;
-	// inline asm
-	mov.u32 	%r179, %tid.x;
-	// inline asm
-	and.b32  	%r183, %r179, 31;
-	and.b32  	%r184, %r182, 536870784;
-	add.s32 	%r185, %r184, %r183;
-	shl.b32 	%r186, %r185, 3;
-	add.s32 	%r188, %r52, %r186;
-	st.shared.v2.f32 	[%r188], {%f260, %f262};
-	st.shared.v2.f32 	[%r188+256], {%f266, %f268};
-	st.shared.v2.f32 	[%r188+512], {%f272, %f274};
-	st.shared.v2.f32 	[%r188+768], {%f278, %f280};
-	bar.sync 	0;
-	// inline asm
-	mov.u32 	%r193, %tid.x;
-	// inline asm
-	shl.b32 	%r197, %r193, 3;
-	add.s32 	%r199, %r52, %r197;
-	st.shared.v2.f32 	[%r199], {%f261, %f263};
-	// inline asm
-	mov.u32 	%r194, %tid.x;
-	// inline asm
-	shl.b32 	%r200, %r194, 3;
-	add.s32 	%r201, %r200, %r52;
-	st.shared.v2.f32 	[%r201+1024], {%f267, %f269};
-	// inline asm
-	mov.u32 	%r195, %tid.x;
-	// inline asm
-	shl.b32 	%r202, %r195, 3;
-	add.s32 	%r203, %r202, %r52;
-	st.shared.v2.f32 	[%r203+2048], {%f273, %f275};
-	// inline asm
-	mov.u32 	%r196, %tid.x;
-	// inline asm
-	shl.b32 	%r204, %r196, 3;
-	add.s32 	%r205, %r204, %r52;
-	st.shared.v2.f32 	[%r205+3072], {%f279, %f281};
-	bar.sync 	0;
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/UHEP/InvFIR.cl-0.ptx b/RTCP/GPUProc/src/UHEP/InvFIR.cl-0.ptx
deleted file mode 100644
index 2179b103dc77531446838556584310d90c722f7b..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/UHEP/InvFIR.cl-0.ptx
+++ /dev/null
@@ -1,447 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Tue Feb  7 07:15:59 2012 (1328595359)
-// Driver 295.20
-//
-
-.version 3.0
-.target sm_21, texmode_independent
-.address_size 32
-
-
-.entry invFIRfilter(
-	.param .u32 .ptr .global .align 1 invFIRfilter_param_0,
-	.param .u32 .ptr .global .align 1 invFIRfilter_param_1,
-	.param .u32 .ptr .global .align 1 invFIRfilter_param_2
-)
-{
-	.reg .f32 	%f<386>;
-	.reg .pred 	%p<2>;
-	.reg .s32 	%r<71>;
-
-
-	ld.param.u32 	%r21, [invFIRfilter_param_0];
-	ld.param.u32 	%r22, [invFIRfilter_param_1];
-	ld.param.u32 	%r23, [invFIRfilter_param_2];
-	// inline asm
-	mov.u32 	%r7, %envreg3;
-	// inline asm
-	// inline asm
-	mov.u32 	%r8, %ntid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r9, %ctaid.x;
-	// inline asm
-	mul.lo.s32 	%r24, %r9, %r8;
-	// inline asm
-	mov.u32 	%r10, %tid.x;
-	// inline asm
-	add.s32 	%r25, %r10, %r7;
-	mad.lo.s32 	%r26, %r9, %r8, %r25;
-	// inline asm
-	mov.u32 	%r11, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r12, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r13, %ctaid.y;
-	// inline asm
-	mul.lo.s32 	%r27, %r13, %r12;
-	// inline asm
-	mov.u32 	%r14, %tid.y;
-	// inline asm
-	add.s32 	%r28, %r14, %r11;
-	mad.lo.s32 	%r29, %r13, %r12, %r28;
-	// inline asm
-	mov.u32 	%r15, %envreg5;
-	// inline asm
-	// inline asm
-	mov.u32 	%r16, %ntid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r17, %ctaid.z;
-	// inline asm
-	mul.lo.s32 	%r30, %r17, %r16;
-	// inline asm
-	mov.u32 	%r18, %tid.z;
-	// inline asm
-	add.s32 	%r31, %r18, %r15;
-	mad.lo.s32 	%r32, %r17, %r16, %r31;
-	shl.b32 	%r33, %r26, 6;
-	add.s32 	%r34, %r23, %r33;
-	mad.lo.s32 	%r35, %r32, 8511488, %r22;
-	mad.lo.s32 	%r36, %r29, 4255744, %r35;
-	shl.b32 	%r37, %r26, 2;
-	add.s32 	%r38, %r36, %r37;
-	ld.global.f32 	%f17, [%r38];
-	mov.u32 	%r70, 0;
-	ld.global.f32 	%f18, [%r38+4096];
-	ld.global.f32 	%f19, [%r38+8192];
-	ld.global.f32 	%f20, [%r38+12288];
-	ld.global.f32 	%f21, [%r38+16384];
-	ld.global.f32 	%f22, [%r38+20480];
-	ld.global.f32 	%f23, [%r38+24576];
-	ld.global.f32 	%f24, [%r38+28672];
-	ld.global.f32 	%f25, [%r38+32768];
-	ld.global.f32 	%f26, [%r38+36864];
-	ld.global.f32 	%f27, [%r38+40960];
-	ld.global.f32 	%f28, [%r38+45056];
-	ld.global.f32 	%f29, [%r38+49152];
-	ld.global.f32 	%f30, [%r38+53248];
-	ld.global.f32 	%f31, [%r38+57344];
-	ld.global.v4.f32 	{%f367, %f368, %f369, %f370}, [%r34+48];
-	ld.global.v4.f32 	{%f371, %f372, %f373, %f374}, [%r34+32];
-	ld.global.v4.f32 	{%f375, %f376, %f377, %f378}, [%r34+16];
-	ld.global.v4.f32 	{%f379, %f380, %f381, %f382}, [%r34];
-	shl.b32 	%r39, %r30, 23;
-	shl.b32 	%r40, %r15, 23;
-	add.s32 	%r41, %r39, %r40;
-	shl.b32 	%r42, %r18, 23;
-	add.s32 	%r43, %r41, %r42;
-	shl.b32 	%r44, %r27, 22;
-	add.s32 	%r45, %r43, %r44;
-	shl.b32 	%r46, %r11, 22;
-	add.s32 	%r47, %r45, %r46;
-	shl.b32 	%r48, %r14, 22;
-	add.s32 	%r49, %r47, %r48;
-	shl.b32 	%r50, %r24, 2;
-	add.s32 	%r51, %r49, %r50;
-	shl.b32 	%r52, %r7, 2;
-	add.s32 	%r53, %r51, %r52;
-	shl.b32 	%r54, %r10, 2;
-	add.s32 	%r55, %r53, %r54;
-	add.s32 	%r56, %r55, %r21;
-	add.s32 	%r1, %r56, 32768;
-	mul.lo.s32 	%r57, %r15, 8511488;
-	mad.lo.s32 	%r58, %r30, 8511488, %r57;
-	mad.lo.s32 	%r59, %r18, 8511488, %r58;
-	mad.lo.s32 	%r60, %r27, 4255744, %r59;
-	mad.lo.s32 	%r61, %r11, 4255744, %r60;
-	mad.lo.s32 	%r62, %r14, 4255744, %r61;
-	add.s32 	%r63, %r62, %r50;
-	add.s32 	%r64, %r63, %r52;
-	add.s32 	%r65, %r64, %r54;
-	add.s32 	%r66, %r65, %r22;
-	add.s32 	%r2, %r66, 122880;
-	mov.f32 	%f347, %f25;
-	mov.f32 	%f348, %f26;
-	mov.f32 	%f349, %f27;
-	mov.f32 	%f350, %f28;
-	mov.f32 	%f335, %f21;
-	mov.f32 	%f336, %f22;
-	mov.f32 	%f337, %f23;
-	mov.f32 	%f338, %f24;
-	mov.f32 	%f323, %f17;
-	mov.f32 	%f324, %f18;
-	mov.f32 	%f325, %f19;
-	mov.f32 	%f326, %f20;
-	mov.f32 	%f383, %f29;
-	mov.f32 	%f384, %f30;
-	mov.f32 	%f385, %f31;
-	mov.f32 	%f362, %f32;
-	mov.u32 	%r69, 64;
-
-BB0_1:
-	mul.ftz.f32 	%f35, %f369, %f324;
-	fma.rn.ftz.f32 	%f36, %f370, %f323, %f35;
-	fma.rn.ftz.f32 	%f38, %f368, %f325, %f36;
-	fma.rn.ftz.f32 	%f40, %f367, %f326, %f38;
-	fma.rn.ftz.f32 	%f42, %f374, %f335, %f40;
-	fma.rn.ftz.f32 	%f44, %f373, %f336, %f42;
-	fma.rn.ftz.f32 	%f46, %f372, %f337, %f44;
-	fma.rn.ftz.f32 	%f48, %f371, %f338, %f46;
-	fma.rn.ftz.f32 	%f50, %f378, %f347, %f48;
-	fma.rn.ftz.f32 	%f52, %f377, %f348, %f50;
-	fma.rn.ftz.f32 	%f54, %f376, %f349, %f52;
-	fma.rn.ftz.f32 	%f56, %f375, %f350, %f54;
-	fma.rn.ftz.f32 	%f58, %f382, %f383, %f56;
-	fma.rn.ftz.f32 	%f60, %f381, %f384, %f58;
-	fma.rn.ftz.f32 	%f62, %f380, %f385, %f60;
-	add.s32 	%r67, %r2, %r70;
-	ld.global.f32 	%f63, [%r67+-57344];
-	fma.rn.ftz.f32 	%f64, %f379, %f63, %f62;
-	add.s32 	%r68, %r1, %r70;
-	st.global.f32 	[%r68+-32768], %f64;
-	mul.ftz.f32 	%f65, %f369, %f325;
-	fma.rn.ftz.f32 	%f66, %f370, %f324, %f65;
-	fma.rn.ftz.f32 	%f67, %f368, %f326, %f66;
-	fma.rn.ftz.f32 	%f68, %f367, %f335, %f67;
-	fma.rn.ftz.f32 	%f69, %f374, %f336, %f68;
-	fma.rn.ftz.f32 	%f70, %f373, %f337, %f69;
-	fma.rn.ftz.f32 	%f71, %f372, %f338, %f70;
-	fma.rn.ftz.f32 	%f72, %f371, %f347, %f71;
-	fma.rn.ftz.f32 	%f73, %f378, %f348, %f72;
-	fma.rn.ftz.f32 	%f74, %f377, %f349, %f73;
-	fma.rn.ftz.f32 	%f75, %f376, %f350, %f74;
-	fma.rn.ftz.f32 	%f76, %f375, %f383, %f75;
-	fma.rn.ftz.f32 	%f77, %f382, %f384, %f76;
-	fma.rn.ftz.f32 	%f78, %f381, %f385, %f77;
-	ld.global.f32 	%f79, [%r67+-53248];
-	fma.rn.ftz.f32 	%f80, %f380, %f79, %f78;
-	fma.rn.ftz.f32 	%f81, %f379, %f323, %f80;
-	st.global.f32 	[%r68+-28672], %f81;
-	mul.ftz.f32 	%f82, %f369, %f326;
-	fma.rn.ftz.f32 	%f83, %f370, %f325, %f82;
-	fma.rn.ftz.f32 	%f84, %f368, %f335, %f83;
-	fma.rn.ftz.f32 	%f85, %f367, %f336, %f84;
-	fma.rn.ftz.f32 	%f86, %f374, %f337, %f85;
-	fma.rn.ftz.f32 	%f87, %f373, %f338, %f86;
-	fma.rn.ftz.f32 	%f88, %f372, %f347, %f87;
-	fma.rn.ftz.f32 	%f89, %f371, %f348, %f88;
-	fma.rn.ftz.f32 	%f90, %f378, %f349, %f89;
-	fma.rn.ftz.f32 	%f91, %f377, %f350, %f90;
-	fma.rn.ftz.f32 	%f92, %f376, %f383, %f91;
-	fma.rn.ftz.f32 	%f93, %f375, %f384, %f92;
-	fma.rn.ftz.f32 	%f94, %f382, %f385, %f93;
-	ld.global.f32 	%f95, [%r67+-49152];
-	fma.rn.ftz.f32 	%f96, %f381, %f95, %f94;
-	fma.rn.ftz.f32 	%f97, %f380, %f323, %f96;
-	fma.rn.ftz.f32 	%f98, %f379, %f324, %f97;
-	st.global.f32 	[%r68+-24576], %f98;
-	mul.ftz.f32 	%f99, %f369, %f335;
-	fma.rn.ftz.f32 	%f100, %f370, %f326, %f99;
-	fma.rn.ftz.f32 	%f101, %f368, %f336, %f100;
-	fma.rn.ftz.f32 	%f102, %f367, %f337, %f101;
-	fma.rn.ftz.f32 	%f103, %f374, %f338, %f102;
-	fma.rn.ftz.f32 	%f104, %f373, %f347, %f103;
-	fma.rn.ftz.f32 	%f105, %f372, %f348, %f104;
-	fma.rn.ftz.f32 	%f106, %f371, %f349, %f105;
-	fma.rn.ftz.f32 	%f107, %f378, %f350, %f106;
-	fma.rn.ftz.f32 	%f108, %f377, %f383, %f107;
-	fma.rn.ftz.f32 	%f109, %f376, %f384, %f108;
-	fma.rn.ftz.f32 	%f110, %f375, %f385, %f109;
-	ld.global.f32 	%f111, [%r67+-45056];
-	fma.rn.ftz.f32 	%f112, %f382, %f111, %f110;
-	fma.rn.ftz.f32 	%f113, %f381, %f323, %f112;
-	fma.rn.ftz.f32 	%f114, %f380, %f324, %f113;
-	fma.rn.ftz.f32 	%f115, %f379, %f325, %f114;
-	st.global.f32 	[%r68+-20480], %f115;
-	mul.ftz.f32 	%f116, %f369, %f336;
-	fma.rn.ftz.f32 	%f117, %f370, %f335, %f116;
-	fma.rn.ftz.f32 	%f118, %f368, %f337, %f117;
-	fma.rn.ftz.f32 	%f119, %f367, %f338, %f118;
-	fma.rn.ftz.f32 	%f120, %f374, %f347, %f119;
-	fma.rn.ftz.f32 	%f121, %f373, %f348, %f120;
-	fma.rn.ftz.f32 	%f122, %f372, %f349, %f121;
-	fma.rn.ftz.f32 	%f123, %f371, %f350, %f122;
-	fma.rn.ftz.f32 	%f124, %f378, %f383, %f123;
-	fma.rn.ftz.f32 	%f125, %f377, %f384, %f124;
-	fma.rn.ftz.f32 	%f126, %f376, %f385, %f125;
-	ld.global.f32 	%f127, [%r67+-40960];
-	fma.rn.ftz.f32 	%f128, %f375, %f127, %f126;
-	fma.rn.ftz.f32 	%f129, %f382, %f323, %f128;
-	fma.rn.ftz.f32 	%f130, %f381, %f324, %f129;
-	fma.rn.ftz.f32 	%f131, %f380, %f325, %f130;
-	fma.rn.ftz.f32 	%f132, %f379, %f326, %f131;
-	st.global.f32 	[%r68+-16384], %f132;
-	mul.ftz.f32 	%f133, %f369, %f337;
-	fma.rn.ftz.f32 	%f134, %f370, %f336, %f133;
-	fma.rn.ftz.f32 	%f135, %f368, %f338, %f134;
-	fma.rn.ftz.f32 	%f136, %f367, %f347, %f135;
-	fma.rn.ftz.f32 	%f137, %f374, %f348, %f136;
-	fma.rn.ftz.f32 	%f138, %f373, %f349, %f137;
-	fma.rn.ftz.f32 	%f139, %f372, %f350, %f138;
-	fma.rn.ftz.f32 	%f140, %f371, %f383, %f139;
-	fma.rn.ftz.f32 	%f141, %f378, %f384, %f140;
-	fma.rn.ftz.f32 	%f142, %f377, %f385, %f141;
-	ld.global.f32 	%f143, [%r67+-36864];
-	fma.rn.ftz.f32 	%f144, %f376, %f143, %f142;
-	fma.rn.ftz.f32 	%f145, %f375, %f323, %f144;
-	fma.rn.ftz.f32 	%f146, %f382, %f324, %f145;
-	fma.rn.ftz.f32 	%f147, %f381, %f325, %f146;
-	fma.rn.ftz.f32 	%f148, %f380, %f326, %f147;
-	fma.rn.ftz.f32 	%f149, %f379, %f335, %f148;
-	st.global.f32 	[%r68+-12288], %f149;
-	mul.ftz.f32 	%f150, %f369, %f338;
-	fma.rn.ftz.f32 	%f151, %f370, %f337, %f150;
-	fma.rn.ftz.f32 	%f152, %f368, %f347, %f151;
-	fma.rn.ftz.f32 	%f153, %f367, %f348, %f152;
-	fma.rn.ftz.f32 	%f154, %f374, %f349, %f153;
-	fma.rn.ftz.f32 	%f155, %f373, %f350, %f154;
-	fma.rn.ftz.f32 	%f156, %f372, %f383, %f155;
-	fma.rn.ftz.f32 	%f157, %f371, %f384, %f156;
-	fma.rn.ftz.f32 	%f158, %f378, %f385, %f157;
-	ld.global.f32 	%f159, [%r67+-32768];
-	fma.rn.ftz.f32 	%f160, %f377, %f159, %f158;
-	fma.rn.ftz.f32 	%f161, %f376, %f323, %f160;
-	fma.rn.ftz.f32 	%f162, %f375, %f324, %f161;
-	fma.rn.ftz.f32 	%f163, %f382, %f325, %f162;
-	fma.rn.ftz.f32 	%f164, %f381, %f326, %f163;
-	fma.rn.ftz.f32 	%f165, %f380, %f335, %f164;
-	fma.rn.ftz.f32 	%f166, %f379, %f336, %f165;
-	st.global.f32 	[%r68+-8192], %f166;
-	mul.ftz.f32 	%f167, %f369, %f347;
-	fma.rn.ftz.f32 	%f168, %f370, %f338, %f167;
-	fma.rn.ftz.f32 	%f169, %f368, %f348, %f168;
-	fma.rn.ftz.f32 	%f170, %f367, %f349, %f169;
-	fma.rn.ftz.f32 	%f171, %f374, %f350, %f170;
-	fma.rn.ftz.f32 	%f172, %f373, %f383, %f171;
-	fma.rn.ftz.f32 	%f173, %f372, %f384, %f172;
-	fma.rn.ftz.f32 	%f174, %f371, %f385, %f173;
-	ld.global.f32 	%f175, [%r67+-28672];
-	fma.rn.ftz.f32 	%f176, %f378, %f175, %f174;
-	fma.rn.ftz.f32 	%f177, %f377, %f323, %f176;
-	fma.rn.ftz.f32 	%f178, %f376, %f324, %f177;
-	fma.rn.ftz.f32 	%f179, %f375, %f325, %f178;
-	fma.rn.ftz.f32 	%f180, %f382, %f326, %f179;
-	fma.rn.ftz.f32 	%f181, %f381, %f335, %f180;
-	fma.rn.ftz.f32 	%f182, %f380, %f336, %f181;
-	fma.rn.ftz.f32 	%f183, %f379, %f337, %f182;
-	st.global.f32 	[%r68+-4096], %f183;
-	mul.ftz.f32 	%f184, %f369, %f348;
-	fma.rn.ftz.f32 	%f185, %f370, %f347, %f184;
-	fma.rn.ftz.f32 	%f186, %f368, %f349, %f185;
-	fma.rn.ftz.f32 	%f187, %f367, %f350, %f186;
-	fma.rn.ftz.f32 	%f188, %f374, %f383, %f187;
-	fma.rn.ftz.f32 	%f189, %f373, %f384, %f188;
-	fma.rn.ftz.f32 	%f190, %f372, %f385, %f189;
-	ld.global.f32 	%f191, [%r67+-24576];
-	fma.rn.ftz.f32 	%f192, %f371, %f191, %f190;
-	fma.rn.ftz.f32 	%f193, %f378, %f323, %f192;
-	fma.rn.ftz.f32 	%f194, %f377, %f324, %f193;
-	fma.rn.ftz.f32 	%f195, %f376, %f325, %f194;
-	fma.rn.ftz.f32 	%f196, %f375, %f326, %f195;
-	fma.rn.ftz.f32 	%f197, %f382, %f335, %f196;
-	fma.rn.ftz.f32 	%f198, %f381, %f336, %f197;
-	fma.rn.ftz.f32 	%f199, %f380, %f337, %f198;
-	fma.rn.ftz.f32 	%f200, %f379, %f338, %f199;
-	st.global.f32 	[%r68], %f200;
-	mul.ftz.f32 	%f201, %f369, %f349;
-	fma.rn.ftz.f32 	%f202, %f370, %f348, %f201;
-	fma.rn.ftz.f32 	%f203, %f368, %f350, %f202;
-	fma.rn.ftz.f32 	%f204, %f367, %f383, %f203;
-	fma.rn.ftz.f32 	%f205, %f374, %f384, %f204;
-	fma.rn.ftz.f32 	%f206, %f373, %f385, %f205;
-	ld.global.f32 	%f207, [%r67+-20480];
-	fma.rn.ftz.f32 	%f208, %f372, %f207, %f206;
-	fma.rn.ftz.f32 	%f209, %f371, %f323, %f208;
-	fma.rn.ftz.f32 	%f210, %f378, %f324, %f209;
-	fma.rn.ftz.f32 	%f211, %f377, %f325, %f210;
-	fma.rn.ftz.f32 	%f212, %f376, %f326, %f211;
-	fma.rn.ftz.f32 	%f213, %f375, %f335, %f212;
-	fma.rn.ftz.f32 	%f214, %f382, %f336, %f213;
-	fma.rn.ftz.f32 	%f215, %f381, %f337, %f214;
-	fma.rn.ftz.f32 	%f216, %f380, %f338, %f215;
-	fma.rn.ftz.f32 	%f217, %f379, %f347, %f216;
-	st.global.f32 	[%r68+4096], %f217;
-	mul.ftz.f32 	%f218, %f369, %f350;
-	fma.rn.ftz.f32 	%f219, %f370, %f349, %f218;
-	fma.rn.ftz.f32 	%f220, %f368, %f383, %f219;
-	fma.rn.ftz.f32 	%f221, %f367, %f384, %f220;
-	fma.rn.ftz.f32 	%f222, %f374, %f385, %f221;
-	ld.global.f32 	%f223, [%r67+-16384];
-	fma.rn.ftz.f32 	%f224, %f373, %f223, %f222;
-	fma.rn.ftz.f32 	%f225, %f372, %f323, %f224;
-	fma.rn.ftz.f32 	%f226, %f371, %f324, %f225;
-	fma.rn.ftz.f32 	%f227, %f378, %f325, %f226;
-	fma.rn.ftz.f32 	%f228, %f377, %f326, %f227;
-	fma.rn.ftz.f32 	%f229, %f376, %f335, %f228;
-	fma.rn.ftz.f32 	%f230, %f375, %f336, %f229;
-	fma.rn.ftz.f32 	%f231, %f382, %f337, %f230;
-	fma.rn.ftz.f32 	%f232, %f381, %f338, %f231;
-	fma.rn.ftz.f32 	%f233, %f380, %f347, %f232;
-	fma.rn.ftz.f32 	%f234, %f379, %f348, %f233;
-	st.global.f32 	[%r68+8192], %f234;
-	mul.ftz.f32 	%f235, %f369, %f383;
-	fma.rn.ftz.f32 	%f236, %f370, %f350, %f235;
-	fma.rn.ftz.f32 	%f237, %f368, %f384, %f236;
-	fma.rn.ftz.f32 	%f238, %f367, %f385, %f237;
-	ld.global.f32 	%f239, [%r67+-12288];
-	fma.rn.ftz.f32 	%f240, %f374, %f239, %f238;
-	fma.rn.ftz.f32 	%f241, %f373, %f323, %f240;
-	fma.rn.ftz.f32 	%f242, %f372, %f324, %f241;
-	fma.rn.ftz.f32 	%f243, %f371, %f325, %f242;
-	fma.rn.ftz.f32 	%f244, %f378, %f326, %f243;
-	fma.rn.ftz.f32 	%f245, %f377, %f335, %f244;
-	fma.rn.ftz.f32 	%f246, %f376, %f336, %f245;
-	fma.rn.ftz.f32 	%f247, %f375, %f337, %f246;
-	fma.rn.ftz.f32 	%f248, %f382, %f338, %f247;
-	fma.rn.ftz.f32 	%f249, %f381, %f347, %f248;
-	fma.rn.ftz.f32 	%f250, %f380, %f348, %f249;
-	fma.rn.ftz.f32 	%f251, %f379, %f349, %f250;
-	st.global.f32 	[%r68+12288], %f251;
-	mul.ftz.f32 	%f252, %f369, %f384;
-	fma.rn.ftz.f32 	%f253, %f370, %f383, %f252;
-	fma.rn.ftz.f32 	%f254, %f368, %f385, %f253;
-	ld.global.f32 	%f255, [%r67+-8192];
-	fma.rn.ftz.f32 	%f256, %f367, %f255, %f254;
-	fma.rn.ftz.f32 	%f257, %f374, %f323, %f256;
-	fma.rn.ftz.f32 	%f258, %f373, %f324, %f257;
-	fma.rn.ftz.f32 	%f259, %f372, %f325, %f258;
-	fma.rn.ftz.f32 	%f260, %f371, %f326, %f259;
-	fma.rn.ftz.f32 	%f261, %f378, %f335, %f260;
-	fma.rn.ftz.f32 	%f262, %f377, %f336, %f261;
-	fma.rn.ftz.f32 	%f263, %f376, %f337, %f262;
-	fma.rn.ftz.f32 	%f264, %f375, %f338, %f263;
-	fma.rn.ftz.f32 	%f265, %f382, %f347, %f264;
-	fma.rn.ftz.f32 	%f266, %f381, %f348, %f265;
-	fma.rn.ftz.f32 	%f267, %f380, %f349, %f266;
-	fma.rn.ftz.f32 	%f268, %f379, %f350, %f267;
-	st.global.f32 	[%r68+16384], %f268;
-	mul.ftz.f32 	%f269, %f369, %f385;
-	fma.rn.ftz.f32 	%f270, %f370, %f384, %f269;
-	ld.global.f32 	%f271, [%r67+-4096];
-	fma.rn.ftz.f32 	%f272, %f368, %f271, %f270;
-	fma.rn.ftz.f32 	%f273, %f367, %f323, %f272;
-	fma.rn.ftz.f32 	%f274, %f374, %f324, %f273;
-	fma.rn.ftz.f32 	%f275, %f373, %f325, %f274;
-	fma.rn.ftz.f32 	%f276, %f372, %f326, %f275;
-	fma.rn.ftz.f32 	%f277, %f371, %f335, %f276;
-	fma.rn.ftz.f32 	%f278, %f378, %f336, %f277;
-	fma.rn.ftz.f32 	%f279, %f377, %f337, %f278;
-	fma.rn.ftz.f32 	%f280, %f376, %f338, %f279;
-	fma.rn.ftz.f32 	%f281, %f375, %f347, %f280;
-	fma.rn.ftz.f32 	%f282, %f382, %f348, %f281;
-	fma.rn.ftz.f32 	%f283, %f381, %f349, %f282;
-	fma.rn.ftz.f32 	%f284, %f380, %f350, %f283;
-	fma.rn.ftz.f32 	%f285, %f379, %f383, %f284;
-	st.global.f32 	[%r68+20480], %f285;
-	ld.global.f32 	%f286, [%r67];
-	mul.ftz.f32 	%f287, %f369, %f286;
-	fma.rn.ftz.f32 	%f288, %f370, %f385, %f287;
-	fma.rn.ftz.f32 	%f289, %f368, %f323, %f288;
-	fma.rn.ftz.f32 	%f290, %f367, %f324, %f289;
-	fma.rn.ftz.f32 	%f291, %f374, %f325, %f290;
-	fma.rn.ftz.f32 	%f292, %f373, %f326, %f291;
-	fma.rn.ftz.f32 	%f293, %f372, %f335, %f292;
-	fma.rn.ftz.f32 	%f294, %f371, %f336, %f293;
-	fma.rn.ftz.f32 	%f295, %f378, %f337, %f294;
-	fma.rn.ftz.f32 	%f296, %f377, %f338, %f295;
-	fma.rn.ftz.f32 	%f297, %f376, %f347, %f296;
-	fma.rn.ftz.f32 	%f298, %f375, %f348, %f297;
-	fma.rn.ftz.f32 	%f299, %f382, %f349, %f298;
-	fma.rn.ftz.f32 	%f300, %f381, %f350, %f299;
-	fma.rn.ftz.f32 	%f301, %f380, %f383, %f300;
-	fma.rn.ftz.f32 	%f302, %f379, %f384, %f301;
-	st.global.f32 	[%r68+24576], %f302;
-	mul.ftz.f32 	%f303, %f369, %f323;
-	fma.rn.ftz.f32 	%f304, %f370, %f286, %f303;
-	fma.rn.ftz.f32 	%f305, %f368, %f324, %f304;
-	fma.rn.ftz.f32 	%f306, %f367, %f325, %f305;
-	fma.rn.ftz.f32 	%f307, %f374, %f326, %f306;
-	fma.rn.ftz.f32 	%f308, %f373, %f335, %f307;
-	fma.rn.ftz.f32 	%f309, %f372, %f336, %f308;
-	fma.rn.ftz.f32 	%f310, %f371, %f337, %f309;
-	fma.rn.ftz.f32 	%f311, %f378, %f338, %f310;
-	fma.rn.ftz.f32 	%f312, %f377, %f347, %f311;
-	fma.rn.ftz.f32 	%f313, %f376, %f348, %f312;
-	fma.rn.ftz.f32 	%f314, %f375, %f349, %f313;
-	fma.rn.ftz.f32 	%f315, %f382, %f350, %f314;
-	fma.rn.ftz.f32 	%f316, %f381, %f383, %f315;
-	fma.rn.ftz.f32 	%f317, %f380, %f384, %f316;
-	fma.rn.ftz.f32 	%f318, %f379, %f385, %f317;
-	st.global.f32 	[%r68+28672], %f318;
-	add.s32 	%r70, %r70, 65536;
-	add.s32 	%r69, %r69, -1;
-	setp.ne.s32 	%p1, %r69, 0;
-	mov.f32 	%f383, %f383;
-	mov.f32 	%f384, %f384;
-	mov.f32 	%f385, %f385;
-	mov.f32 	%f366, %f286;
-	@%p1 bra 	BB0_1;
-
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/UHEP/Transpose.cl-0.ptx b/RTCP/GPUProc/src/UHEP/Transpose.cl-0.ptx
deleted file mode 100644
index cb8248be325fa81a44e309be3a1a84ebe64e5e75..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/UHEP/Transpose.cl-0.ptx
+++ /dev/null
@@ -1,142 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Tue Feb  7 07:15:59 2012 (1328595359)
-// Driver 295.20
-//
-
-.version 3.0
-.target sm_21, texmode_independent
-.address_size 32
-
-.extern .shared .align 16 .b8 shr_1_tmp[4352];
-
-.entry UHEP_Transpose(
-	.param .u32 .ptr .global .align 1 UHEP_Transpose_param_0,
-	.param .u32 .ptr .global .align 1 UHEP_Transpose_param_1,
-	.param .u32 .ptr .global .align 4 UHEP_Transpose_param_2
-)
-{
-	.reg .f32 	%f<30>;
-	.reg .pred 	%p<7>;
-	.reg .s32 	%r<61>;
-
-
-	ld.param.u32 	%r25, [UHEP_Transpose_param_0];
-	ld.param.u32 	%r26, [UHEP_Transpose_param_1];
-	ld.param.u32 	%r27, [UHEP_Transpose_param_2];
-	// inline asm
-	mov.u32 	%r12, %envreg4;
-	// inline asm
-	// inline asm
-	mov.u32 	%r13, %ntid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r14, %ctaid.y;
-	// inline asm
-	// inline asm
-	mov.u32 	%r15, %tid.y;
-	// inline asm
-	add.s32 	%r28, %r15, %r12;
-	mad.lo.s32 	%r29, %r14, %r13, %r28;
-	// inline asm
-	mov.u32 	%r16, %envreg5;
-	// inline asm
-	// inline asm
-	mov.u32 	%r17, %ntid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r18, %ctaid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r19, %tid.z;
-	// inline asm
-	add.s32 	%r30, %r19, %r16;
-	mad.lo.s32 	%r31, %r18, %r17, %r30;
-	// inline asm
-	mov.u32 	%r20, %tid.x;
-	// inline asm
-	shl.b32 	%r32, %r29, 4;
-	and.b32  	%r33, %r20, 15;
-	add.s32 	%r34, %r32, %r33;
-	// inline asm
-	mov.u32 	%r21, %tid.x;
-	// inline asm
-	shr.u32 	%r35, %r21, 4;
-	shl.b32 	%r36, %r31, 4;
-	add.s32 	%r37, %r36, %r35;
-	shl.b32 	%r38, %r37, 2;
-	add.s32 	%r39, %r27, %r38;
-	setp.lt.u32 	%p4, %r34, 4;
-	ld.global.u32 	%r40, [%r39];
-	setp.gt.s32 	%p5, %r40, -1;
-	and.pred  	%p1, %p4, %p5;
-	// inline asm
-	mov.u32 	%r22, %tid.x;
-	// inline asm
-	shr.u32 	%r41, %r22, 4;
-	// inline asm
-	mov.u32 	%r23, %tid.x;
-	// inline asm
-	and.b32  	%r42, %r23, 15;
-	add.s32 	%r43, %r36, %r42;
-	shl.b32 	%r44, %r43, 2;
-	add.s32 	%r45, %r27, %r44;
-	add.s32 	%r46, %r41, %r32;
-	setp.lt.u32 	%p2, %r46, 4;
-	mov.u32 	%r47, shr_1_tmp;
-	mad.lo.s32 	%r48, %r33, 272, %r47;
-	and.b32  	%r49, %r21, -16;
-	add.s32 	%r1, %r48, %r49;
-	ld.global.u32 	%r50, [%r45];
-	setp.gt.s32 	%p3, %r50, -1;
-	mad.lo.s32 	%r51, %r41, 272, %r47;
-	shl.b32 	%r52, %r42, 4;
-	add.s32 	%r2, %r51, %r52;
-	shl.b32 	%r53, %r43, 3;
-	mad.lo.s32 	%r54, %r46, 8511488, %r53;
-	add.s32 	%r55, %r54, %r25;
-	add.s32 	%r60, %r55, 4255744;
-	shl.b32 	%r56, %r34, 4;
-	mad.lo.s32 	%r57, %r40, 66496, %r56;
-	add.s32 	%r59, %r26, %r57;
-	mov.u32 	%r58, 1039;
-
-BB0_1:
-	add.s32 	%r8, %r60, -4255744;
-	@!%p1 bra 	BB0_3;
-
-	ld.global.v4.f32 	{%f22, %f23, %f24, %f25}, [%r59];
-	st.shared.v4.f32 	[%r1], {%f22, %f23, %f24, %f25};
-
-BB0_3:
-	bar.sync 	0;
-	@!%p2 bra 	BB0_8;
-
-	@%p3 bra 	BB0_6;
-
-	mov.f32 	%f1, 0f00000000;
-	mov.f32 	%f26, %f1;
-	mov.f32 	%f27, %f1;
-	mov.f32 	%f28, %f1;
-	mov.f32 	%f29, %f1;
-	bra.uni 	BB0_7;
-
-BB0_6:
-	ld.shared.v4.f32 	{%f26, %f27, %f28, %f29}, [%r2];
-
-BB0_7:
-	st.global.v2.f32 	[%r8], {%f26, %f27};
-	st.global.v2.f32 	[%r8+4255744], {%f28, %f29};
-
-BB0_8:
-	bar.sync 	0;
-	add.s32 	%r60, %r60, 4096;
-	add.s32 	%r59, %r59, 64;
-	add.s32 	%r58, %r58, -1;
-	setp.ne.s32 	%p6, %r58, 0;
-	@%p6 bra 	BB0_1;
-
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/UHEP/Trigger.cl-0.ptx b/RTCP/GPUProc/src/UHEP/Trigger.cl-0.ptx
deleted file mode 100644
index ae2713f9037df945f18a988f001c6c0d0abbf29b..0000000000000000000000000000000000000000
--- a/RTCP/GPUProc/src/UHEP/Trigger.cl-0.ptx
+++ /dev/null
@@ -1,511 +0,0 @@
-//
-// Generated by NVIDIA NVVM Compiler
-// Compiler built on Tue Feb  7 07:15:59 2012 (1328595359)
-// Driver 295.20
-//
-
-.version 3.0
-.target sm_21, texmode_independent
-.address_size 32
-
-.extern .shared .align 64 .b8 shr_4_tmp[16384];
-
-.entry trigger(
-	.param .u32 .ptr .global .align 1 trigger_param_0,
-	.param .u32 .ptr .global .align 4 trigger_param_1
-)
-{
-	.reg .f32 	%f<434>;
-	.reg .pred 	%p<21>;
-	.reg .s32 	%r<71>;
-
-
-	ld.param.u32 	%r31, [trigger_param_1];
-	// inline asm
-	mov.u32 	%r23, %tid.x;
-	// inline asm
-	// inline asm
-	mov.u32 	%r24, %tid.y;
-	// inline asm
-	shl.b32 	%r32, %r24, 4;
-	// inline asm
-	mov.u32 	%r25, %envreg5;
-	// inline asm
-	// inline asm
-	mov.u32 	%r26, %ntid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r27, %ctaid.z;
-	// inline asm
-	// inline asm
-	mov.u32 	%r28, %tid.z;
-	// inline asm
-	add.s32 	%r33, %r28, %r25;
-	mad.lo.s32 	%r3, %r27, %r26, %r33;
-	shl.b32 	%r34, %r24, 10;
-	mov.u32 	%r35, shr_4_tmp;
-	add.s32 	%r36, %r35, %r34;
-	shl.b32 	%r37, %r23, 6;
-	add.s32 	%r4, %r36, %r37;
-	add.s32 	%r2, %r32, %r23;
-	shl.b32 	%r38, %r2, 12;
-	and.b32  	%r5, %r38, 16773120;
-	shl.b32 	%r39, %r3, 21;
-	add.s32 	%r40, %r23, %r39;
-	shl.b32 	%r41, %r24, 12;
-	add.s32 	%r42, %r40, %r41;
-	shl.b32 	%r43, %r42, 2;
-	add.s32 	%r70, %r31, %r43;
-	shl.b32 	%r44, %r24, 6;
-	shl.b32 	%r45, %r23, 2;
-	add.s32 	%r46, %r35, %r44;
-	add.s32 	%r47, %r46, %r45;
-	add.s32 	%r7, %r47, 3072;
-	mov.f32 	%f420, 0f00000000;
-	mov.f32 	%f430, %f420;
-	mov.f32 	%f431, %f420;
-	mov.f32 	%f432, %f420;
-	mov.f32 	%f433, %f420;
-	mov.f32 	%f422, %f405;
-	mov.f32 	%f419, %f420;
-	mov.f32 	%f421, %f420;
-	mov.f32 	%f418, %f420;
-	mov.u32 	%r65, 0;
-	mov.u32 	%r64, %r65;
-	mov.f32 	%f426, %f420;
-	mov.f32 	%f427, %f420;
-	mov.f32 	%f428, %f420;
-	mov.f32 	%f429, %f420;
-	mov.f32 	%f423, %f420;
-	mov.f32 	%f424, %f420;
-	mov.f32 	%f425, %f420;
-
-BB0_1:
-	mov.u32 	%r68, %r70;
-	mov.u32 	%r8, %r68;
-	shl.b32 	%r49, %r64, 4;
-	add.s32 	%r12, %r5, %r49;
-	mov.u32 	%r67, 16;
-	mov.u32 	%r66, %r7;
-	mov.u32 	%r69, %r8;
-
-BB0_2:
-	mov.u32 	%r15, %r69;
-	ld.global.f32 	%f40, [%r15];
-	ld.global.f32 	%f41, [%r15+4194304];
-	mul.ftz.f32 	%f42, %f41, %f41;
-	fma.rn.ftz.f32 	%f43, %f40, %f40, %f42;
-	st.shared.f32 	[%r66+-3072], %f43;
-	sub.ftz.f32 	%f44, %f43, %f419;
-	add.ftz.f32 	%f45, %f421, 0f3F800000;
-	div.approx.ftz.f32 	%f46, %f44, %f45;
-	add.ftz.f32 	%f47, %f419, %f46;
-	sub.ftz.f32 	%f48, %f43, %f47;
-	fma.rn.ftz.f32 	%f49, %f44, %f48, %f420;
-	ld.global.f32 	%f50, [%r15+262144];
-	ld.global.f32 	%f51, [%r15+4456448];
-	mul.ftz.f32 	%f52, %f51, %f51;
-	fma.rn.ftz.f32 	%f53, %f50, %f50, %f52;
-	st.shared.f32 	[%r66+-2048], %f53;
-	add.ftz.f32 	%f54, %f45, 0f3F800000;
-	sub.ftz.f32 	%f55, %f53, %f47;
-	div.approx.ftz.f32 	%f56, %f55, %f54;
-	add.ftz.f32 	%f57, %f47, %f56;
-	sub.ftz.f32 	%f58, %f53, %f57;
-	fma.rn.ftz.f32 	%f59, %f55, %f58, %f49;
-	ld.global.f32 	%f60, [%r15+524288];
-	ld.global.f32 	%f61, [%r15+4718592];
-	mul.ftz.f32 	%f62, %f61, %f61;
-	fma.rn.ftz.f32 	%f63, %f60, %f60, %f62;
-	st.shared.f32 	[%r66+-1024], %f63;
-	add.ftz.f32 	%f64, %f54, 0f3F800000;
-	sub.ftz.f32 	%f65, %f63, %f57;
-	div.approx.ftz.f32 	%f66, %f65, %f64;
-	add.ftz.f32 	%f67, %f57, %f66;
-	sub.ftz.f32 	%f68, %f63, %f67;
-	fma.rn.ftz.f32 	%f69, %f65, %f68, %f59;
-	ld.global.f32 	%f70, [%r15+786432];
-	ld.global.f32 	%f71, [%r15+4980736];
-	mul.ftz.f32 	%f72, %f71, %f71;
-	fma.rn.ftz.f32 	%f73, %f70, %f70, %f72;
-	st.shared.f32 	[%r66], %f73;
-	add.ftz.f32 	%f421, %f64, 0f3F800000;
-	sub.ftz.f32 	%f74, %f73, %f67;
-	div.approx.ftz.f32 	%f75, %f74, %f421;
-	add.ftz.f32 	%f419, %f67, %f75;
-	sub.ftz.f32 	%f76, %f73, %f419;
-	fma.rn.ftz.f32 	%f420, %f74, %f76, %f69;
-	add.s32 	%r16, %r15, 1048576;
-	add.s32 	%r66, %r66, 4096;
-	add.s32 	%r67, %r67, -4;
-	setp.ne.s32 	%p1, %r67, 0;
-	mov.u32 	%r69, %r16;
-	@%p1 bra 	BB0_2;
-
-	bar.sync 	0;
-	ld.shared.v4.f32 	{%f258, %f259, %f260, %f261}, [%r4];
-	add.ftz.f32 	%f79, %f422, %f258;
-	sub.ftz.f32 	%f81, %f79, %f423;
-	add.ftz.f32 	%f83, %f81, %f259;
-	sub.ftz.f32 	%f85, %f83, %f424;
-	add.ftz.f32 	%f87, %f85, %f260;
-	sub.ftz.f32 	%f89, %f87, %f425;
-	add.ftz.f32 	%f91, %f89, %f261;
-	sub.ftz.f32 	%f93, %f91, %f426;
-	ld.shared.v4.f32 	{%f274, %f423, %f424, %f425}, [%r4+16];
-	add.ftz.f32 	%f95, %f93, %f274;
-	sub.ftz.f32 	%f97, %f95, %f427;
-	add.ftz.f32 	%f99, %f97, %f423;
-	sub.ftz.f32 	%f101, %f99, %f428;
-	add.ftz.f32 	%f103, %f101, %f424;
-	sub.ftz.f32 	%f105, %f103, %f429;
-	add.ftz.f32 	%f107, %f105, %f425;
-	sub.ftz.f32 	%f109, %f107, %f430;
-	ld.shared.v4.f32 	{%f426, %f427, %f428, %f429}, [%r4+32];
-	add.ftz.f32 	%f111, %f109, %f426;
-	sub.ftz.f32 	%f113, %f111, %f431;
-	add.ftz.f32 	%f115, %f113, %f427;
-	sub.ftz.f32 	%f117, %f115, %f432;
-	add.ftz.f32 	%f119, %f117, %f428;
-	sub.ftz.f32 	%f121, %f119, %f433;
-	add.ftz.f32 	%f123, %f121, %f429;
-	sub.ftz.f32 	%f124, %f123, %f258;
-	ld.shared.v4.f32 	{%f430, %f431, %f432, %f433}, [%r4+48];
-	add.ftz.f32 	%f126, %f124, %f430;
-	sub.ftz.f32 	%f127, %f126, %f259;
-	add.ftz.f32 	%f129, %f127, %f431;
-	sub.ftz.f32 	%f130, %f129, %f260;
-	add.ftz.f32 	%f132, %f130, %f432;
-	sub.ftz.f32 	%f133, %f132, %f261;
-	add.ftz.f32 	%f135, %f133, %f433;
-	sub.ftz.f32 	%f136, %f135, %f274;
-	max.f32 	%f137, %f81, %f85;
-	max.f32 	%f138, %f89, %f93;
-	max.f32 	%f139, %f137, %f138;
-	max.f32 	%f140, %f97, %f101;
-	max.f32 	%f141, %f105, %f109;
-	max.f32 	%f142, %f140, %f141;
-	max.f32 	%f143, %f113, %f117;
-	max.f32 	%f144, %f121, %f124;
-	max.f32 	%f145, %f143, %f144;
-	max.f32 	%f146, %f127, %f130;
-	max.f32 	%f147, %f133, %f136;
-	max.f32 	%f148, %f146, %f147;
-	max.f32 	%f149, %f139, %f142;
-	max.f32 	%f150, %f145, %f148;
-	max.f32 	%f151, %f149, %f150;
-	setp.ltu.ftz.f32 	%p2, %f151, %f418;
-	selp.f32 	%f418, %f418, %f151, %p2;
-	selp.b32 	%r65, %r65, %r12, %p2;
-	mov.f32 	%f306, %f81;
-	mov.f32 	%f307, %f85;
-	mov.f32 	%f308, %f89;
-	mov.f32 	%f309, %f93;
-	mov.f32 	%f326, %f97;
-	mov.f32 	%f327, %f101;
-	mov.f32 	%f328, %f105;
-	mov.f32 	%f329, %f109;
-	mov.f32 	%f346, %f113;
-	mov.f32 	%f347, %f117;
-	mov.f32 	%f348, %f121;
-	mov.f32 	%f349, %f124;
-	mov.f32 	%f362, %f127;
-	mov.f32 	%f363, %f130;
-	mov.f32 	%f364, %f133;
-	mov.f32 	%f422, %f136;
-	bar.sync 	0;
-	add.s32 	%r64, %r64, 1;
-	add.s32 	%r21, %r8, 64;
-	setp.ne.s32 	%p3, %r64, 256;
-	mov.u32 	%r70, %r21;
-	@%p3 bra 	BB0_1;
-
-	shl.b32 	%r50, %r2, 2;
-	add.s32 	%r22, %r35, %r50;
-	st.shared.f32 	[%r22], %f419;
-	st.shared.f32 	[%r22+1024], %f420;
-	st.shared.f32 	[%r22+2048], %f418;
-	st.shared.u32 	[%r22+3072], %r65;
-	setp.lt.u32 	%p4, %r2, 128;
-	@%p4 bra 	BB0_5;
-	bra.uni 	BB0_7;
-
-BB0_5:
-	ld.shared.f32 	%f152, [%r22+1536];
-	ld.shared.f32 	%f153, [%r22+512];
-	sub.ftz.f32 	%f154, %f153, %f419;
-	add.ftz.f32 	%f155, %f419, %f153;
-	div.rn.ftz.f32 	%f156, %f155, 0f40000000;
-	st.shared.f32 	[%r22], %f156;
-	add.ftz.f32 	%f157, %f420, %f152;
-	mul.ftz.f32 	%f158, %f154, %f154;
-	mul.ftz.f32 	%f159, %f158, %f421;
-	div.rn.ftz.f32 	%f160, %f159, 0f40000000;
-	add.ftz.f32 	%f161, %f157, %f160;
-	st.shared.f32 	[%r22+1024], %f161;
-	add.ftz.f32 	%f421, %f421, %f421;
-	ld.shared.f32 	%f13, [%r22+2560];
-	setp.lt.ftz.f32 	%p5, %f418, %f13;
-	@%p5 bra 	BB0_6;
-	bra.uni 	BB0_7;
-
-BB0_6:
-	st.shared.f32 	[%r22+2048], %f13;
-	ld.shared.u32 	%r52, [%r22+3584];
-	st.shared.u32 	[%r22+3072], %r52;
-
-BB0_7:
-	bar.sync 	0;
-	setp.lt.u32 	%p6, %r2, 64;
-	@%p6 bra 	BB0_8;
-	bra.uni 	BB0_10;
-
-BB0_8:
-	ld.shared.f32 	%f162, [%r22];
-	ld.shared.f32 	%f163, [%r22+1024];
-	ld.shared.f32 	%f164, [%r22+1280];
-	ld.shared.f32 	%f165, [%r22+256];
-	sub.ftz.f32 	%f166, %f165, %f162;
-	add.ftz.f32 	%f167, %f162, %f165;
-	div.rn.ftz.f32 	%f168, %f167, 0f40000000;
-	st.shared.f32 	[%r22], %f168;
-	add.ftz.f32 	%f169, %f163, %f164;
-	mul.ftz.f32 	%f170, %f166, %f166;
-	mul.ftz.f32 	%f171, %f170, %f421;
-	div.rn.ftz.f32 	%f172, %f171, 0f40000000;
-	add.ftz.f32 	%f173, %f169, %f172;
-	st.shared.f32 	[%r22+1024], %f173;
-	add.ftz.f32 	%f421, %f421, %f421;
-	ld.shared.f32 	%f16, [%r22+2304];
-	ld.shared.f32 	%f174, [%r22+2048];
-	setp.lt.ftz.f32 	%p7, %f174, %f16;
-	@%p7 bra 	BB0_9;
-	bra.uni 	BB0_10;
-
-BB0_9:
-	st.shared.f32 	[%r22+2048], %f16;
-	ld.shared.u32 	%r53, [%r22+3328];
-	st.shared.u32 	[%r22+3072], %r53;
-
-BB0_10:
-	bar.sync 	0;
-	setp.lt.u32 	%p8, %r2, 32;
-	@%p8 bra 	BB0_11;
-	bra.uni 	BB0_13;
-
-BB0_11:
-	ld.shared.f32 	%f175, [%r22];
-	ld.shared.f32 	%f176, [%r22+1024];
-	ld.shared.f32 	%f177, [%r22+1152];
-	ld.shared.f32 	%f178, [%r22+128];
-	sub.ftz.f32 	%f179, %f178, %f175;
-	add.ftz.f32 	%f180, %f175, %f178;
-	div.rn.ftz.f32 	%f181, %f180, 0f40000000;
-	st.shared.f32 	[%r22], %f181;
-	add.ftz.f32 	%f182, %f176, %f177;
-	mul.ftz.f32 	%f183, %f179, %f179;
-	mul.ftz.f32 	%f184, %f183, %f421;
-	div.rn.ftz.f32 	%f185, %f184, 0f40000000;
-	add.ftz.f32 	%f186, %f182, %f185;
-	st.shared.f32 	[%r22+1024], %f186;
-	add.ftz.f32 	%f421, %f421, %f421;
-	ld.shared.f32 	%f19, [%r22+2176];
-	ld.shared.f32 	%f187, [%r22+2048];
-	setp.lt.ftz.f32 	%p9, %f187, %f19;
-	@%p9 bra 	BB0_12;
-	bra.uni 	BB0_13;
-
-BB0_12:
-	st.shared.f32 	[%r22+2048], %f19;
-	ld.shared.u32 	%r54, [%r22+3200];
-	st.shared.u32 	[%r22+3072], %r54;
-
-BB0_13:
-	bar.sync 	0;
-	setp.lt.u32 	%p10, %r2, 16;
-	@%p10 bra 	BB0_14;
-	bra.uni 	BB0_16;
-
-BB0_14:
-	ld.shared.f32 	%f188, [%r22];
-	ld.shared.f32 	%f189, [%r22+1024];
-	ld.shared.f32 	%f190, [%r22+1088];
-	ld.shared.f32 	%f191, [%r22+64];
-	sub.ftz.f32 	%f192, %f191, %f188;
-	add.ftz.f32 	%f193, %f188, %f191;
-	div.rn.ftz.f32 	%f194, %f193, 0f40000000;
-	st.shared.f32 	[%r22], %f194;
-	add.ftz.f32 	%f195, %f189, %f190;
-	mul.ftz.f32 	%f196, %f192, %f192;
-	mul.ftz.f32 	%f197, %f196, %f421;
-	div.rn.ftz.f32 	%f198, %f197, 0f40000000;
-	add.ftz.f32 	%f199, %f195, %f198;
-	st.shared.f32 	[%r22+1024], %f199;
-	add.ftz.f32 	%f421, %f421, %f421;
-	ld.shared.f32 	%f22, [%r22+2112];
-	ld.shared.f32 	%f200, [%r22+2048];
-	setp.lt.ftz.f32 	%p11, %f200, %f22;
-	@%p11 bra 	BB0_15;
-	bra.uni 	BB0_16;
-
-BB0_15:
-	st.shared.f32 	[%r22+2048], %f22;
-	ld.shared.u32 	%r55, [%r22+3136];
-	st.shared.u32 	[%r22+3072], %r55;
-
-BB0_16:
-	bar.sync 	0;
-	setp.lt.u32 	%p12, %r2, 8;
-	@%p12 bra 	BB0_17;
-	bra.uni 	BB0_19;
-
-BB0_17:
-	ld.shared.f32 	%f201, [%r22];
-	ld.shared.f32 	%f202, [%r22+1024];
-	ld.shared.f32 	%f203, [%r22+1056];
-	ld.shared.f32 	%f204, [%r22+32];
-	sub.ftz.f32 	%f205, %f204, %f201;
-	add.ftz.f32 	%f206, %f201, %f204;
-	div.rn.ftz.f32 	%f207, %f206, 0f40000000;
-	st.shared.f32 	[%r22], %f207;
-	add.ftz.f32 	%f208, %f202, %f203;
-	mul.ftz.f32 	%f209, %f205, %f205;
-	mul.ftz.f32 	%f210, %f209, %f421;
-	div.rn.ftz.f32 	%f211, %f210, 0f40000000;
-	add.ftz.f32 	%f212, %f208, %f211;
-	st.shared.f32 	[%r22+1024], %f212;
-	add.ftz.f32 	%f421, %f421, %f421;
-	ld.shared.f32 	%f25, [%r22+2080];
-	ld.shared.f32 	%f213, [%r22+2048];
-	setp.lt.ftz.f32 	%p13, %f213, %f25;
-	@%p13 bra 	BB0_18;
-	bra.uni 	BB0_19;
-
-BB0_18:
-	st.shared.f32 	[%r22+2048], %f25;
-	ld.shared.u32 	%r56, [%r22+3104];
-	st.shared.u32 	[%r22+3072], %r56;
-
-BB0_19:
-	bar.sync 	0;
-	setp.lt.u32 	%p14, %r2, 4;
-	@%p14 bra 	BB0_20;
-	bra.uni 	BB0_22;
-
-BB0_20:
-	ld.shared.f32 	%f214, [%r22];
-	ld.shared.f32 	%f215, [%r22+1024];
-	ld.shared.f32 	%f216, [%r22+1040];
-	ld.shared.f32 	%f217, [%r22+16];
-	sub.ftz.f32 	%f218, %f217, %f214;
-	add.ftz.f32 	%f219, %f214, %f217;
-	div.rn.ftz.f32 	%f220, %f219, 0f40000000;
-	st.shared.f32 	[%r22], %f220;
-	add.ftz.f32 	%f221, %f215, %f216;
-	mul.ftz.f32 	%f222, %f218, %f218;
-	mul.ftz.f32 	%f223, %f222, %f421;
-	div.rn.ftz.f32 	%f224, %f223, 0f40000000;
-	add.ftz.f32 	%f225, %f221, %f224;
-	st.shared.f32 	[%r22+1024], %f225;
-	add.ftz.f32 	%f421, %f421, %f421;
-	ld.shared.f32 	%f28, [%r22+2064];
-	ld.shared.f32 	%f226, [%r22+2048];
-	setp.lt.ftz.f32 	%p15, %f226, %f28;
-	@%p15 bra 	BB0_21;
-	bra.uni 	BB0_22;
-
-BB0_21:
-	st.shared.f32 	[%r22+2048], %f28;
-	ld.shared.u32 	%r57, [%r22+3088];
-	st.shared.u32 	[%r22+3072], %r57;
-
-BB0_22:
-	bar.sync 	0;
-	setp.lt.u32 	%p16, %r2, 2;
-	@%p16 bra 	BB0_23;
-	bra.uni 	BB0_25;
-
-BB0_23:
-	ld.shared.f32 	%f227, [%r22];
-	ld.shared.f32 	%f228, [%r22+1024];
-	ld.shared.f32 	%f229, [%r22+1032];
-	ld.shared.f32 	%f230, [%r22+8];
-	sub.ftz.f32 	%f231, %f230, %f227;
-	add.ftz.f32 	%f232, %f227, %f230;
-	div.rn.ftz.f32 	%f233, %f232, 0f40000000;
-	st.shared.f32 	[%r22], %f233;
-	add.ftz.f32 	%f234, %f228, %f229;
-	mul.ftz.f32 	%f235, %f231, %f231;
-	mul.ftz.f32 	%f236, %f235, %f421;
-	div.rn.ftz.f32 	%f237, %f236, 0f40000000;
-	add.ftz.f32 	%f238, %f234, %f237;
-	st.shared.f32 	[%r22+1024], %f238;
-	add.ftz.f32 	%f421, %f421, %f421;
-	ld.shared.f32 	%f31, [%r22+2056];
-	ld.shared.f32 	%f239, [%r22+2048];
-	setp.lt.ftz.f32 	%p17, %f239, %f31;
-	@%p17 bra 	BB0_24;
-	bra.uni 	BB0_25;
-
-BB0_24:
-	st.shared.f32 	[%r22+2048], %f31;
-	ld.shared.u32 	%r58, [%r22+3080];
-	st.shared.u32 	[%r22+3072], %r58;
-
-BB0_25:
-	bar.sync 	0;
-	setp.eq.s32 	%p18, %r2, 0;
-	@%p18 bra 	BB0_26;
-	bra.uni 	BB0_28;
-
-BB0_26:
-	ld.shared.f32 	%f240, [%r22];
-	ld.shared.f32 	%f241, [%r22+1024];
-	ld.shared.f32 	%f242, [%r22+1028];
-	ld.shared.f32 	%f243, [%r22+4];
-	sub.ftz.f32 	%f244, %f243, %f240;
-	add.ftz.f32 	%f245, %f240, %f243;
-	div.rn.ftz.f32 	%f246, %f245, 0f40000000;
-	st.shared.f32 	[%r22], %f246;
-	add.ftz.f32 	%f247, %f241, %f242;
-	mul.ftz.f32 	%f248, %f244, %f244;
-	mul.ftz.f32 	%f249, %f248, %f421;
-	div.rn.ftz.f32 	%f250, %f249, 0f40000000;
-	add.ftz.f32 	%f251, %f247, %f250;
-	st.shared.f32 	[%r22+1024], %f251;
-	add.ftz.f32 	%f421, %f421, %f421;
-	ld.shared.f32 	%f34, [%r22+2052];
-	ld.shared.f32 	%f252, [%r22+2048];
-	setp.lt.ftz.f32 	%p19, %f252, %f34;
-	@%p19 bra 	BB0_27;
-	bra.uni 	BB0_28;
-
-BB0_27:
-	st.shared.f32 	[%r22+2048], %f34;
-	ld.shared.u32 	%r59, [%r22+3076];
-	st.shared.u32 	[%r22+3072], %r59;
-
-BB0_28:
-	bar.sync 	0;
-	@%p18 bra 	BB0_30;
-
-	ret;
-
-BB0_30:
-	shl.b32 	%r60, %r3, 4;
-	ld.param.u32 	%r63, [trigger_param_0];
-	add.s32 	%r61, %r63, %r60;
-	ld.shared.f32 	%f253, [shr_4_tmp];
-	st.global.f32 	[%r61], %f253;
-	add.ftz.f32 	%f254, %f421, 0fBF800000;
-	ld.shared.f32 	%f255, [shr_4_tmp+1024];
-	div.approx.ftz.f32 	%f256, %f255, %f254;
-	st.global.f32 	[%r61+4], %f256;
-	ld.shared.f32 	%f257, [shr_4_tmp+2048];
-	st.global.f32 	[%r61+8], %f257;
-	ld.shared.u32 	%r62, [shr_4_tmp+3072];
-	st.global.u32 	[%r61+12], %r62;
-	ret;
-}
-
-
diff --git a/RTCP/GPUProc/src/octave-core b/RTCP/GPUProc/src/octave-core
deleted file mode 100644
index 608aba01c896492a0c4befa1f98dc1416cc6cf6c..0000000000000000000000000000000000000000
Binary files a/RTCP/GPUProc/src/octave-core and /dev/null differ