diff --git a/.gitattributes b/.gitattributes index 487a7678f0b1a64ca20d44a865dec0842d1bc3c6..48ca44cd2d84f678ccd26442d82be637eb783cc6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3657,7 +3657,6 @@ RTCP/GPUProc/src/Align.h -text RTCP/GPUProc/src/BandPass.cc -text RTCP/GPUProc/src/BandPass.h -text RTCP/GPUProc/src/BeamFormer/BeamFormer.cl -text -RTCP/GPUProc/src/BeamFormer/BeamFormer.cl-0.ptx -text RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.4x3 -text RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.6x3 -text RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.bak -text @@ -3665,16 +3664,11 @@ RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.not -text RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.ok -text RTCP/GPUProc/src/BeamFormer/BeamFormer.cl.orig -text RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl -text -RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl-0.ptx -text RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl.ok -text RTCP/GPUProc/src/BeamFormer/Dedispersion.cl -text -RTCP/GPUProc/src/BeamFormer/Dedispersion.cl-0.ptx -text RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl -text -RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl-0.ptx -text RTCP/GPUProc/src/BeamFormer/IntToFloat.cl -text -RTCP/GPUProc/src/BeamFormer/IntToFloat.cl-0.ptx -text RTCP/GPUProc/src/BeamFormer/Transpose.cl -text -RTCP/GPUProc/src/BeamFormer/Transpose.cl-0.ptx -text RTCP/GPUProc/src/BeamletBuffer.cc -text RTCP/GPUProc/src/BeamletBuffer.h -text RTCP/GPUProc/src/BeamletBufferToComputeNode.cc -text @@ -3682,14 +3676,11 @@ RTCP/GPUProc/src/BeamletBufferToComputeNode.h -text RTCP/GPUProc/src/CL/cl.hpp -text RTCP/GPUProc/src/CMakeLists.txt -text RTCP/GPUProc/src/Correlator.cl -text -RTCP/GPUProc/src/Correlator.cl-0.ptx -text RTCP/GPUProc/src/DelayAndBandPass.cl -text -RTCP/GPUProc/src/DelayAndBandPass.cl-0.ptx -text RTCP/GPUProc/src/Delays.cc -text RTCP/GPUProc/src/Delays.h -text RTCP/GPUProc/src/FFT.cl -text RTCP/GPUProc/src/FIR.cl -text -RTCP/GPUProc/src/FIR.cl-0.ptx -text RTCP/GPUProc/src/FilterBank.cc -text RTCP/GPUProc/src/FilterBank.h -text RTCP/GPUProc/src/InputSection.cc -text @@ -3700,7 +3691,6 @@ RTCP/GPUProc/src/LockedRanges.h -text RTCP/GPUProc/src/LogThread.cc -text RTCP/GPUProc/src/LogThread.h -text RTCP/GPUProc/src/NewCorrelator.cl -text -RTCP/GPUProc/src/NewCorrelator.cl-0.ptx -text RTCP/GPUProc/src/OpenCL_Support.cc -text RTCP/GPUProc/src/OpenCL_Support.h -text RTCP/GPUProc/src/OpenMP_Support.h -text @@ -3714,29 +3704,22 @@ RTCP/GPUProc/src/Scheduling.cc -text RTCP/GPUProc/src/Scheduling.h -text RTCP/GPUProc/src/SlidingPointer.h -text RTCP/GPUProc/src/UHEP/BeamFormer.cl -text -RTCP/GPUProc/src/UHEP/BeamFormer.cl-0.ptx -text RTCP/GPUProc/src/UHEP/BeamFormer.cl.4groups -text RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop -text -RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop-0.ptx -text RTCP/GPUProc/src/UHEP/BeamFormer.cl.not -text RTCP/GPUProc/src/UHEP/BeamFormer.cl.ok -text RTCP/GPUProc/src/UHEP/InvFFT.cl -text -RTCP/GPUProc/src/UHEP/InvFFT.cl-0.ptx -text RTCP/GPUProc/src/UHEP/InvFIR.cl -text -RTCP/GPUProc/src/UHEP/InvFIR.cl-0.ptx -text RTCP/GPUProc/src/UHEP/InvertedStationPPFWeights.cc -text RTCP/GPUProc/src/UHEP/InvertedStationPPFWeights.h -text RTCP/GPUProc/src/UHEP/Transpose.cl -text -RTCP/GPUProc/src/UHEP/Transpose.cl-0.ptx -text RTCP/GPUProc/src/UHEP/Transpose.cl.ok -text RTCP/GPUProc/src/UHEP/Trigger.cl -text -RTCP/GPUProc/src/UHEP/Trigger.cl-0.ptx -text RTCP/GPUProc/src/UHEP/Trigger.cl.8 -text RTCP/GPUProc/src/UHEP/Trigger.cl.ok -text RTCP/GPUProc/src/WallClockTime.h -text RTCP/GPUProc/src/fft2.cl -text RTCP/GPUProc/src/math.cl -text -RTCP/GPUProc/src/octave-core -text RTCP/GPUProc/test/77_Stations.parset -text RTCP/GPUProc/test/AARTFAAC.parset -text RTCP/GPUProc/test/CMakeLists.txt -text diff --git a/RTCP/GPUProc/src/BeamFormer/BeamFormer.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/BeamFormer.cl-0.ptx deleted file mode 100644 index 7f1d211419f4937459be9ecf7a4c5dbc50a45446..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/BeamFormer/BeamFormer.cl-0.ptx +++ /dev/null @@ -1,607 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Sat Sep 29 10:12:13 2012 (1348906333) -// Driver 304.54 -// - -.version 3.0 -.target sm_30, texmode_independent -.address_size 32 - -.extern .shared .align 16 .b8 shr_4__local[4096]; - -.entry complexVoltages( - .param .u32 .ptr .global .align 1 complexVoltages_param_0, - .param .u32 .ptr .global .align 1 complexVoltages_param_1, - .param .u32 .ptr .global .align 1 complexVoltages_param_2 -) -{ - .reg .f32 %f<1124>; - .reg .pred %p<13>; - .reg .s32 %r<206>; - - - ld.param.u32 %r1, [complexVoltages_param_0]; - ld.param.u32 %r3, [complexVoltages_param_2]; - // inline asm - mov.u32 %r80, %tid.x; - // inline asm - // inline asm - mov.u32 %r81, %tid.y; - // inline asm - // inline asm - mov.u32 %r82, %envreg5; - // inline asm - // inline asm - mov.u32 %r83, %ntid.z; - // inline asm - // inline asm - mov.u32 %r84, %ctaid.z; - // inline asm - // inline asm - mov.u32 %r85, %tid.z; - // inline asm - add.s32 %r87, %r85, %r82; - mad.lo.s32 %r10, %r84, %r83, %r87; - shl.b32 %r88, %r10, 10; - shl.b32 %r89, %r81, 3; - add.s32 %r90, %r88, %r89; - add.s32 %r91, %r3, %r90; - ld.global.v2.f32 {%f1028, %f1029}, [%r91]; - ld.global.v2.f32 {%f1034, %f1035}, [%r91+2097152]; - ld.global.v2.f32 {%f1040, %f1041}, [%r91+4194304]; - ld.global.v2.f32 {%f1046, %f1047}, [%r91+6291456]; - ld.global.v2.f32 {%f1052, %f1053}, [%r91+8388608]; - ld.global.v2.f32 {%f1058, %f1059}, [%r91+10485760]; - ld.global.v2.f32 {%f1064, %f1065}, [%r91+12582912]; - ld.global.v2.f32 {%f1070, %f1071}, [%r91+14680064]; - ld.global.v2.f32 {%f1076, %f1077}, [%r91+16777216]; - ld.global.v2.f32 {%f1082, %f1083}, [%r91+18874368]; - ld.global.v2.f32 {%f1088, %f1089}, [%r91+20971520]; - ld.global.v2.f32 {%f1094, %f1095}, [%r91+23068672]; - ld.global.v2.f32 {%f1100, %f1101}, [%r91+25165824]; - ld.global.v2.f32 {%f1106, %f1107}, [%r91+27262976]; - ld.global.v2.f32 {%f1112, %f1113}, [%r91+29360128]; - ld.global.v2.f32 {%f1118, %f1119}, [%r91+31457280]; - shl.b32 %r92, %r10, 16; - shl.b32 %r93, %r81, 4; - add.s32 %r94, %r92, %r93; - shl.b32 %r95, %r80, 3; - add.s32 %r96, %r94, %r95; - add.s32 %r189, %r1, %r96; - mov.u32 %r97, shr_4__local; - add.s32 %r98, %r97, %r95; - add.s32 %r12, %r98, 2048; - mov.u32 %r182, 0; - -BB0_1: - mov.u32 %r187, %r189; - mov.u32 %r13, %r187; - shl.b32 %r15, %r182, 4; - // inline asm - mov.u32 %r99, %tid.x; - // inline asm - // inline asm - mov.u32 %r100, %tid.y; - // inline asm - shl.b32 %r17, %r100, 1; - add.s32 %r184, %r17, %r99; - setp.gt.u32 %p1, %r184, 255; - @%p1 bra BB0_4; - - add.s32 %r19, %r99, %r17; - and.b32 %r102, %r19, 15; - add.s32 %r20, %r15, %r102; - mov.u32 %r183, 0; - -BB0_3: - mov.u32 %r22, %r184; - add.s32 %r103, %r19, %r183; - and.b32 %r104, %r103, 65520; - shl.b32 %r105, %r104, 16; - ld.param.u32 %r179, [complexVoltages_param_1]; - add.s32 %r106, %r179, %r105; - shl.b32 %r107, %r10, 9; - add.s32 %r108, %r106, %r107; - shl.b32 %r109, %r20, 4; - add.s32 %r110, %r108, %r109; - shl.b32 %r111, %r22, 4; - add.s32 %r113, %r97, %r111; - ld.global.v4.f32 {%f1024, %f1025, %f1026, %f1027}, [%r110]; - st.shared.v4.f32 [%r113], {%f1024, %f1025, %f1026, %f1027}; - add.s32 %r23, %r22, 256; - add.s32 %r183, %r183, 256; - setp.gt.u32 %p2, %r22, -257; - mov.u32 %r184, %r23; - @%p2 bra BB0_3; - -BB0_4: - mov.u32 %r186, %r12; - bar.sync 0; - mov.u32 %r185, 16; - mov.u32 %r188, %r13; - -BB0_5: - mov.u32 %r28, %r188; - ld.shared.v2.f32 {%f798, %f799}, [%r186+-2048]; - mov.f32 %f1, 0f00000000; - fma.rn.ftz.f32 %f802, %f1028, %f798, %f1; - fma.rn.ftz.f32 %f803, %f1028, %f799, %f1; - neg.ftz.f32 %f3, %f799; - fma.rn.ftz.f32 %f810, %f1029, %f3, %f802; - fma.rn.ftz.f32 %f811, %f1029, %f798, %f803; - ld.shared.v2.f32 {%f814, %f815}, [%r186+-1792]; - fma.rn.ftz.f32 %f816, %f1034, %f814, %f810; - fma.rn.ftz.f32 %f817, %f1034, %f815, %f811; - neg.ftz.f32 %f6, %f815; - fma.rn.ftz.f32 %f824, %f1035, %f6, %f816; - fma.rn.ftz.f32 %f825, %f1035, %f814, %f817; - ld.shared.v2.f32 {%f828, %f829}, [%r186+-1536]; - fma.rn.ftz.f32 %f830, %f1040, %f828, %f824; - fma.rn.ftz.f32 %f831, %f1040, %f829, %f825; - neg.ftz.f32 %f9, %f829; - fma.rn.ftz.f32 %f838, %f1041, %f9, %f830; - fma.rn.ftz.f32 %f839, %f1041, %f828, %f831; - ld.shared.v2.f32 {%f842, %f843}, [%r186+-1280]; - fma.rn.ftz.f32 %f844, %f1046, %f842, %f838; - fma.rn.ftz.f32 %f845, %f1046, %f843, %f839; - neg.ftz.f32 %f12, %f843; - fma.rn.ftz.f32 %f852, %f1047, %f12, %f844; - fma.rn.ftz.f32 %f853, %f1047, %f842, %f845; - ld.shared.v2.f32 {%f856, %f857}, [%r186+-1024]; - fma.rn.ftz.f32 %f858, %f1052, %f856, %f852; - fma.rn.ftz.f32 %f859, %f1052, %f857, %f853; - neg.ftz.f32 %f15, %f857; - fma.rn.ftz.f32 %f866, %f1053, %f15, %f858; - fma.rn.ftz.f32 %f867, %f1053, %f856, %f859; - ld.shared.v2.f32 {%f870, %f871}, [%r186+-768]; - fma.rn.ftz.f32 %f872, %f1058, %f870, %f866; - fma.rn.ftz.f32 %f873, %f1058, %f871, %f867; - neg.ftz.f32 %f18, %f871; - fma.rn.ftz.f32 %f880, %f1059, %f18, %f872; - fma.rn.ftz.f32 %f881, %f1059, %f870, %f873; - ld.shared.v2.f32 {%f884, %f885}, [%r186+-512]; - fma.rn.ftz.f32 %f886, %f1064, %f884, %f880; - fma.rn.ftz.f32 %f887, %f1064, %f885, %f881; - neg.ftz.f32 %f21, %f885; - fma.rn.ftz.f32 %f894, %f1065, %f21, %f886; - fma.rn.ftz.f32 %f895, %f1065, %f884, %f887; - ld.shared.v2.f32 {%f898, %f899}, [%r186+-256]; - fma.rn.ftz.f32 %f900, %f1070, %f898, %f894; - fma.rn.ftz.f32 %f901, %f1070, %f899, %f895; - neg.ftz.f32 %f24, %f899; - fma.rn.ftz.f32 %f908, %f1071, %f24, %f900; - fma.rn.ftz.f32 %f909, %f1071, %f898, %f901; - ld.shared.v2.f32 {%f912, %f913}, [%r186]; - fma.rn.ftz.f32 %f914, %f1076, %f912, %f908; - fma.rn.ftz.f32 %f915, %f1076, %f913, %f909; - neg.ftz.f32 %f27, %f913; - fma.rn.ftz.f32 %f922, %f1077, %f27, %f914; - fma.rn.ftz.f32 %f923, %f1077, %f912, %f915; - ld.shared.v2.f32 {%f926, %f927}, [%r186+256]; - fma.rn.ftz.f32 %f928, %f1082, %f926, %f922; - fma.rn.ftz.f32 %f929, %f1082, %f927, %f923; - neg.ftz.f32 %f30, %f927; - fma.rn.ftz.f32 %f936, %f1083, %f30, %f928; - fma.rn.ftz.f32 %f937, %f1083, %f926, %f929; - ld.shared.v2.f32 {%f940, %f941}, [%r186+512]; - fma.rn.ftz.f32 %f942, %f1088, %f940, %f936; - fma.rn.ftz.f32 %f943, %f1088, %f941, %f937; - neg.ftz.f32 %f33, %f941; - fma.rn.ftz.f32 %f950, %f1089, %f33, %f942; - fma.rn.ftz.f32 %f951, %f1089, %f940, %f943; - ld.shared.v2.f32 {%f954, %f955}, [%r186+768]; - fma.rn.ftz.f32 %f956, %f1094, %f954, %f950; - fma.rn.ftz.f32 %f957, %f1094, %f955, %f951; - neg.ftz.f32 %f36, %f955; - fma.rn.ftz.f32 %f964, %f1095, %f36, %f956; - fma.rn.ftz.f32 %f965, %f1095, %f954, %f957; - ld.shared.v2.f32 {%f968, %f969}, [%r186+1024]; - fma.rn.ftz.f32 %f970, %f1100, %f968, %f964; - fma.rn.ftz.f32 %f971, %f1100, %f969, %f965; - neg.ftz.f32 %f39, %f969; - fma.rn.ftz.f32 %f978, %f1101, %f39, %f970; - fma.rn.ftz.f32 %f979, %f1101, %f968, %f971; - ld.shared.v2.f32 {%f982, %f983}, [%r186+1280]; - fma.rn.ftz.f32 %f984, %f1106, %f982, %f978; - fma.rn.ftz.f32 %f985, %f1106, %f983, %f979; - neg.ftz.f32 %f42, %f983; - fma.rn.ftz.f32 %f992, %f1107, %f42, %f984; - fma.rn.ftz.f32 %f993, %f1107, %f982, %f985; - ld.shared.v2.f32 {%f996, %f997}, [%r186+1536]; - fma.rn.ftz.f32 %f998, %f1112, %f996, %f992; - fma.rn.ftz.f32 %f999, %f1112, %f997, %f993; - neg.ftz.f32 %f45, %f997; - fma.rn.ftz.f32 %f1006, %f1113, %f45, %f998; - fma.rn.ftz.f32 %f1007, %f1113, %f996, %f999; - ld.shared.v2.f32 {%f1010, %f1011}, [%r186+1792]; - fma.rn.ftz.f32 %f1012, %f1118, %f1010, %f1006; - fma.rn.ftz.f32 %f1013, %f1118, %f1011, %f1007; - neg.ftz.f32 %f48, %f1011; - fma.rn.ftz.f32 %f1020, %f1119, %f48, %f1012; - fma.rn.ftz.f32 %f1021, %f1119, %f1010, %f1013; - st.global.v2.f32 [%r28], {%f1020, %f1021}; - add.s32 %r29, %r28, 2048; - add.s32 %r186, %r186, 16; - add.s32 %r185, %r185, -1; - setp.ne.s32 %p3, %r185, 0; - mov.u32 %r188, %r29; - @%p3 bra BB0_5; - - bar.sync 0; - add.s32 %r182, %r182, 1; - add.s32 %r33, %r13, 32768; - setp.ne.s32 %p4, %r182, 2; - mov.u32 %r189, %r33; - @%p4 bra BB0_1; - - ld.param.u32 %r181, [complexVoltages_param_2]; - add.s32 %r119, %r181, %r90; - ld.global.v2.f32 {%f702, %f703}, [%r119+33554432]; - ld.global.v2.f32 {%f708, %f709}, [%r119+35651584]; - ld.global.v2.f32 {%f714, %f715}, [%r119+37748736]; - ld.global.v2.f32 {%f720, %f721}, [%r119+39845888]; - ld.global.v2.f32 {%f726, %f727}, [%r119+41943040]; - ld.global.v2.f32 {%f732, %f733}, [%r119+44040192]; - ld.global.v2.f32 {%f738, %f739}, [%r119+46137344]; - ld.global.v2.f32 {%f744, %f745}, [%r119+48234496]; - ld.global.v2.f32 {%f750, %f751}, [%r119+50331648]; - ld.global.v2.f32 {%f756, %f757}, [%r119+52428800]; - ld.global.v2.f32 {%f762, %f763}, [%r119+54525952]; - ld.global.v2.f32 {%f768, %f769}, [%r119+56623104]; - ld.global.v2.f32 {%f774, %f775}, [%r119+58720256]; - ld.global.v2.f32 {%f780, %f781}, [%r119+60817408]; - ld.global.v2.f32 {%f786, %f787}, [%r119+62914560]; - ld.global.v2.f32 {%f792, %f793}, [%r119+65011712]; - ld.param.u32 %r176, [complexVoltages_param_0]; - add.s32 %r197, %r176, %r96; - mov.u32 %r190, 0; - -BB0_8: - mov.u32 %r195, %r197; - mov.u32 %r36, %r195; - shl.b32 %r38, %r190, 4; - // inline asm - mov.u32 %r129, %tid.x; - // inline asm - // inline asm - mov.u32 %r130, %tid.y; - // inline asm - shl.b32 %r40, %r130, 1; - add.s32 %r192, %r40, %r129; - setp.gt.u32 %p5, %r192, 255; - @%p5 bra BB0_11; - - add.s32 %r42, %r129, %r40; - and.b32 %r132, %r42, 15; - add.s32 %r43, %r38, %r132; - mov.u32 %r191, 0; - -BB0_10: - mov.u32 %r45, %r192; - add.s32 %r133, %r42, %r191; - shl.b32 %r134, %r43, 4; - shl.b32 %r135, %r10, 9; - shl.b32 %r136, %r45, 4; - add.s32 %r138, %r97, %r136; - and.b32 %r139, %r133, 65520; - shl.b32 %r140, %r139, 16; - ld.param.u32 %r178, [complexVoltages_param_1]; - add.s32 %r141, %r140, %r178; - add.s32 %r142, %r141, %r135; - add.s32 %r143, %r142, %r134; - ld.global.v4.f32 {%f698, %f699, %f700, %f701}, [%r143+16777216]; - st.shared.v4.f32 [%r138], {%f698, %f699, %f700, %f701}; - add.s32 %r46, %r45, 256; - add.s32 %r191, %r191, 256; - setp.gt.u32 %p6, %r45, -257; - mov.u32 %r192, %r46; - @%p6 bra BB0_10; - -BB0_11: - mov.u32 %r194, %r12; - bar.sync 0; - mov.u32 %r193, 16; - mov.u32 %r196, %r36; - -BB0_12: - mov.u32 %r51, %r196; - ld.shared.v2.f32 {%f472, %f473}, [%r194+-2048]; - ld.global.v2.f32 {%f474, %f475}, [%r51]; - fma.rn.ftz.f32 %f476, %f702, %f472, %f474; - fma.rn.ftz.f32 %f477, %f702, %f473, %f475; - neg.ftz.f32 %f51, %f473; - fma.rn.ftz.f32 %f484, %f703, %f51, %f476; - fma.rn.ftz.f32 %f485, %f703, %f472, %f477; - ld.shared.v2.f32 {%f488, %f489}, [%r194+-1792]; - fma.rn.ftz.f32 %f490, %f708, %f488, %f484; - fma.rn.ftz.f32 %f491, %f708, %f489, %f485; - neg.ftz.f32 %f54, %f489; - fma.rn.ftz.f32 %f498, %f709, %f54, %f490; - fma.rn.ftz.f32 %f499, %f709, %f488, %f491; - ld.shared.v2.f32 {%f502, %f503}, [%r194+-1536]; - fma.rn.ftz.f32 %f504, %f714, %f502, %f498; - fma.rn.ftz.f32 %f505, %f714, %f503, %f499; - neg.ftz.f32 %f57, %f503; - fma.rn.ftz.f32 %f512, %f715, %f57, %f504; - fma.rn.ftz.f32 %f513, %f715, %f502, %f505; - ld.shared.v2.f32 {%f516, %f517}, [%r194+-1280]; - fma.rn.ftz.f32 %f518, %f720, %f516, %f512; - fma.rn.ftz.f32 %f519, %f720, %f517, %f513; - neg.ftz.f32 %f60, %f517; - fma.rn.ftz.f32 %f526, %f721, %f60, %f518; - fma.rn.ftz.f32 %f527, %f721, %f516, %f519; - ld.shared.v2.f32 {%f530, %f531}, [%r194+-1024]; - fma.rn.ftz.f32 %f532, %f726, %f530, %f526; - fma.rn.ftz.f32 %f533, %f726, %f531, %f527; - neg.ftz.f32 %f63, %f531; - fma.rn.ftz.f32 %f540, %f727, %f63, %f532; - fma.rn.ftz.f32 %f541, %f727, %f530, %f533; - ld.shared.v2.f32 {%f544, %f545}, [%r194+-768]; - fma.rn.ftz.f32 %f546, %f732, %f544, %f540; - fma.rn.ftz.f32 %f547, %f732, %f545, %f541; - neg.ftz.f32 %f66, %f545; - fma.rn.ftz.f32 %f554, %f733, %f66, %f546; - fma.rn.ftz.f32 %f555, %f733, %f544, %f547; - ld.shared.v2.f32 {%f558, %f559}, [%r194+-512]; - fma.rn.ftz.f32 %f560, %f738, %f558, %f554; - fma.rn.ftz.f32 %f561, %f738, %f559, %f555; - neg.ftz.f32 %f69, %f559; - fma.rn.ftz.f32 %f568, %f739, %f69, %f560; - fma.rn.ftz.f32 %f569, %f739, %f558, %f561; - ld.shared.v2.f32 {%f572, %f573}, [%r194+-256]; - fma.rn.ftz.f32 %f574, %f744, %f572, %f568; - fma.rn.ftz.f32 %f575, %f744, %f573, %f569; - neg.ftz.f32 %f72, %f573; - fma.rn.ftz.f32 %f582, %f745, %f72, %f574; - fma.rn.ftz.f32 %f583, %f745, %f572, %f575; - ld.shared.v2.f32 {%f586, %f587}, [%r194]; - fma.rn.ftz.f32 %f588, %f750, %f586, %f582; - fma.rn.ftz.f32 %f589, %f750, %f587, %f583; - neg.ftz.f32 %f75, %f587; - fma.rn.ftz.f32 %f596, %f751, %f75, %f588; - fma.rn.ftz.f32 %f597, %f751, %f586, %f589; - ld.shared.v2.f32 {%f600, %f601}, [%r194+256]; - fma.rn.ftz.f32 %f602, %f756, %f600, %f596; - fma.rn.ftz.f32 %f603, %f756, %f601, %f597; - neg.ftz.f32 %f78, %f601; - fma.rn.ftz.f32 %f610, %f757, %f78, %f602; - fma.rn.ftz.f32 %f611, %f757, %f600, %f603; - ld.shared.v2.f32 {%f614, %f615}, [%r194+512]; - fma.rn.ftz.f32 %f616, %f762, %f614, %f610; - fma.rn.ftz.f32 %f617, %f762, %f615, %f611; - neg.ftz.f32 %f81, %f615; - fma.rn.ftz.f32 %f624, %f763, %f81, %f616; - fma.rn.ftz.f32 %f625, %f763, %f614, %f617; - ld.shared.v2.f32 {%f628, %f629}, [%r194+768]; - fma.rn.ftz.f32 %f630, %f768, %f628, %f624; - fma.rn.ftz.f32 %f631, %f768, %f629, %f625; - neg.ftz.f32 %f84, %f629; - fma.rn.ftz.f32 %f638, %f769, %f84, %f630; - fma.rn.ftz.f32 %f639, %f769, %f628, %f631; - ld.shared.v2.f32 {%f642, %f643}, [%r194+1024]; - fma.rn.ftz.f32 %f644, %f774, %f642, %f638; - fma.rn.ftz.f32 %f645, %f774, %f643, %f639; - neg.ftz.f32 %f87, %f643; - fma.rn.ftz.f32 %f652, %f775, %f87, %f644; - fma.rn.ftz.f32 %f653, %f775, %f642, %f645; - ld.shared.v2.f32 {%f656, %f657}, [%r194+1280]; - fma.rn.ftz.f32 %f658, %f780, %f656, %f652; - fma.rn.ftz.f32 %f659, %f780, %f657, %f653; - neg.ftz.f32 %f90, %f657; - fma.rn.ftz.f32 %f666, %f781, %f90, %f658; - fma.rn.ftz.f32 %f667, %f781, %f656, %f659; - ld.shared.v2.f32 {%f670, %f671}, [%r194+1536]; - fma.rn.ftz.f32 %f672, %f786, %f670, %f666; - fma.rn.ftz.f32 %f673, %f786, %f671, %f667; - neg.ftz.f32 %f93, %f671; - fma.rn.ftz.f32 %f680, %f787, %f93, %f672; - fma.rn.ftz.f32 %f681, %f787, %f670, %f673; - ld.shared.v2.f32 {%f684, %f685}, [%r194+1792]; - fma.rn.ftz.f32 %f686, %f792, %f684, %f680; - fma.rn.ftz.f32 %f687, %f792, %f685, %f681; - neg.ftz.f32 %f96, %f685; - fma.rn.ftz.f32 %f694, %f793, %f96, %f686; - fma.rn.ftz.f32 %f695, %f793, %f684, %f687; - st.global.v2.f32 [%r51], {%f694, %f695}; - add.s32 %r52, %r51, 2048; - add.s32 %r194, %r194, 16; - add.s32 %r193, %r193, -1; - setp.ne.s32 %p7, %r193, 0; - mov.u32 %r196, %r52; - @%p7 bra BB0_12; - - bar.sync 0; - add.s32 %r190, %r190, 1; - add.s32 %r56, %r36, 32768; - setp.ne.s32 %p8, %r190, 2; - mov.u32 %r197, %r56; - @%p8 bra BB0_8; - - ld.param.u32 %r180, [complexVoltages_param_2]; - add.s32 %r149, %r180, %r90; - ld.global.v2.f32 {%f376, %f377}, [%r149+67108864]; - ld.global.v2.f32 {%f382, %f383}, [%r149+69206016]; - ld.global.v2.f32 {%f388, %f389}, [%r149+71303168]; - ld.global.v2.f32 {%f394, %f395}, [%r149+73400320]; - ld.global.v2.f32 {%f400, %f401}, [%r149+75497472]; - ld.global.v2.f32 {%f406, %f407}, [%r149+77594624]; - ld.global.v2.f32 {%f412, %f413}, [%r149+79691776]; - ld.global.v2.f32 {%f418, %f419}, [%r149+81788928]; - ld.global.v2.f32 {%f424, %f425}, [%r149+83886080]; - ld.global.v2.f32 {%f430, %f431}, [%r149+85983232]; - ld.global.v2.f32 {%f436, %f437}, [%r149+88080384]; - ld.global.v2.f32 {%f442, %f443}, [%r149+90177536]; - ld.global.v2.f32 {%f448, %f449}, [%r149+92274688]; - ld.global.v2.f32 {%f454, %f455}, [%r149+94371840]; - ld.global.v2.f32 {%f460, %f461}, [%r149+96468992]; - ld.global.v2.f32 {%f466, %f467}, [%r149+98566144]; - ld.param.u32 %r175, [complexVoltages_param_0]; - add.s32 %r205, %r175, %r96; - mov.u32 %r198, 0; - -BB0_15: - mov.u32 %r203, %r205; - mov.u32 %r59, %r203; - shl.b32 %r61, %r198, 4; - // inline asm - mov.u32 %r159, %tid.x; - // inline asm - // inline asm - mov.u32 %r160, %tid.y; - // inline asm - shl.b32 %r63, %r160, 1; - add.s32 %r200, %r63, %r159; - setp.gt.u32 %p9, %r200, 255; - @%p9 bra BB0_18; - - add.s32 %r65, %r159, %r63; - and.b32 %r162, %r65, 15; - add.s32 %r66, %r61, %r162; - mov.u32 %r199, 0; - -BB0_17: - mov.u32 %r68, %r200; - add.s32 %r163, %r65, %r199; - shl.b32 %r164, %r66, 4; - shl.b32 %r165, %r10, 9; - shl.b32 %r166, %r68, 4; - add.s32 %r168, %r97, %r166; - and.b32 %r169, %r163, 65520; - shl.b32 %r170, %r169, 16; - ld.param.u32 %r177, [complexVoltages_param_1]; - add.s32 %r171, %r170, %r177; - add.s32 %r172, %r171, %r165; - add.s32 %r173, %r172, %r164; - ld.global.v4.f32 {%f372, %f373, %f374, %f375}, [%r173+33554432]; - st.shared.v4.f32 [%r168], {%f372, %f373, %f374, %f375}; - add.s32 %r69, %r68, 256; - add.s32 %r199, %r199, 256; - setp.gt.u32 %p10, %r68, -257; - mov.u32 %r200, %r69; - @%p10 bra BB0_17; - -BB0_18: - mov.u32 %r202, %r12; - bar.sync 0; - mov.u32 %r201, 16; - mov.u32 %r204, %r59; - -BB0_19: - mov.u32 %r74, %r204; - ld.shared.v2.f32 {%f146, %f147}, [%r202+-2048]; - ld.global.v2.f32 {%f148, %f149}, [%r74]; - fma.rn.ftz.f32 %f150, %f376, %f146, %f148; - fma.rn.ftz.f32 %f151, %f376, %f147, %f149; - neg.ftz.f32 %f99, %f147; - fma.rn.ftz.f32 %f158, %f377, %f99, %f150; - fma.rn.ftz.f32 %f159, %f377, %f146, %f151; - ld.shared.v2.f32 {%f162, %f163}, [%r202+-1792]; - fma.rn.ftz.f32 %f164, %f382, %f162, %f158; - fma.rn.ftz.f32 %f165, %f382, %f163, %f159; - neg.ftz.f32 %f102, %f163; - fma.rn.ftz.f32 %f172, %f383, %f102, %f164; - fma.rn.ftz.f32 %f173, %f383, %f162, %f165; - ld.shared.v2.f32 {%f176, %f177}, [%r202+-1536]; - fma.rn.ftz.f32 %f178, %f388, %f176, %f172; - fma.rn.ftz.f32 %f179, %f388, %f177, %f173; - neg.ftz.f32 %f105, %f177; - fma.rn.ftz.f32 %f186, %f389, %f105, %f178; - fma.rn.ftz.f32 %f187, %f389, %f176, %f179; - ld.shared.v2.f32 {%f190, %f191}, [%r202+-1280]; - fma.rn.ftz.f32 %f192, %f394, %f190, %f186; - fma.rn.ftz.f32 %f193, %f394, %f191, %f187; - neg.ftz.f32 %f108, %f191; - fma.rn.ftz.f32 %f200, %f395, %f108, %f192; - fma.rn.ftz.f32 %f201, %f395, %f190, %f193; - ld.shared.v2.f32 {%f204, %f205}, [%r202+-1024]; - fma.rn.ftz.f32 %f206, %f400, %f204, %f200; - fma.rn.ftz.f32 %f207, %f400, %f205, %f201; - neg.ftz.f32 %f111, %f205; - fma.rn.ftz.f32 %f214, %f401, %f111, %f206; - fma.rn.ftz.f32 %f215, %f401, %f204, %f207; - ld.shared.v2.f32 {%f218, %f219}, [%r202+-768]; - fma.rn.ftz.f32 %f220, %f406, %f218, %f214; - fma.rn.ftz.f32 %f221, %f406, %f219, %f215; - neg.ftz.f32 %f114, %f219; - fma.rn.ftz.f32 %f228, %f407, %f114, %f220; - fma.rn.ftz.f32 %f229, %f407, %f218, %f221; - ld.shared.v2.f32 {%f232, %f233}, [%r202+-512]; - fma.rn.ftz.f32 %f234, %f412, %f232, %f228; - fma.rn.ftz.f32 %f235, %f412, %f233, %f229; - neg.ftz.f32 %f117, %f233; - fma.rn.ftz.f32 %f242, %f413, %f117, %f234; - fma.rn.ftz.f32 %f243, %f413, %f232, %f235; - ld.shared.v2.f32 {%f246, %f247}, [%r202+-256]; - fma.rn.ftz.f32 %f248, %f418, %f246, %f242; - fma.rn.ftz.f32 %f249, %f418, %f247, %f243; - neg.ftz.f32 %f120, %f247; - fma.rn.ftz.f32 %f256, %f419, %f120, %f248; - fma.rn.ftz.f32 %f257, %f419, %f246, %f249; - ld.shared.v2.f32 {%f260, %f261}, [%r202]; - fma.rn.ftz.f32 %f262, %f424, %f260, %f256; - fma.rn.ftz.f32 %f263, %f424, %f261, %f257; - neg.ftz.f32 %f123, %f261; - fma.rn.ftz.f32 %f270, %f425, %f123, %f262; - fma.rn.ftz.f32 %f271, %f425, %f260, %f263; - ld.shared.v2.f32 {%f274, %f275}, [%r202+256]; - fma.rn.ftz.f32 %f276, %f430, %f274, %f270; - fma.rn.ftz.f32 %f277, %f430, %f275, %f271; - neg.ftz.f32 %f126, %f275; - fma.rn.ftz.f32 %f284, %f431, %f126, %f276; - fma.rn.ftz.f32 %f285, %f431, %f274, %f277; - ld.shared.v2.f32 {%f288, %f289}, [%r202+512]; - fma.rn.ftz.f32 %f290, %f436, %f288, %f284; - fma.rn.ftz.f32 %f291, %f436, %f289, %f285; - neg.ftz.f32 %f129, %f289; - fma.rn.ftz.f32 %f298, %f437, %f129, %f290; - fma.rn.ftz.f32 %f299, %f437, %f288, %f291; - ld.shared.v2.f32 {%f302, %f303}, [%r202+768]; - fma.rn.ftz.f32 %f304, %f442, %f302, %f298; - fma.rn.ftz.f32 %f305, %f442, %f303, %f299; - neg.ftz.f32 %f132, %f303; - fma.rn.ftz.f32 %f312, %f443, %f132, %f304; - fma.rn.ftz.f32 %f313, %f443, %f302, %f305; - ld.shared.v2.f32 {%f316, %f317}, [%r202+1024]; - fma.rn.ftz.f32 %f318, %f448, %f316, %f312; - fma.rn.ftz.f32 %f319, %f448, %f317, %f313; - neg.ftz.f32 %f135, %f317; - fma.rn.ftz.f32 %f326, %f449, %f135, %f318; - fma.rn.ftz.f32 %f327, %f449, %f316, %f319; - ld.shared.v2.f32 {%f330, %f331}, [%r202+1280]; - fma.rn.ftz.f32 %f332, %f454, %f330, %f326; - fma.rn.ftz.f32 %f333, %f454, %f331, %f327; - neg.ftz.f32 %f138, %f331; - fma.rn.ftz.f32 %f340, %f455, %f138, %f332; - fma.rn.ftz.f32 %f341, %f455, %f330, %f333; - ld.shared.v2.f32 {%f344, %f345}, [%r202+1536]; - fma.rn.ftz.f32 %f346, %f460, %f344, %f340; - fma.rn.ftz.f32 %f347, %f460, %f345, %f341; - neg.ftz.f32 %f141, %f345; - fma.rn.ftz.f32 %f354, %f461, %f141, %f346; - fma.rn.ftz.f32 %f355, %f461, %f344, %f347; - ld.shared.v2.f32 {%f358, %f359}, [%r202+1792]; - fma.rn.ftz.f32 %f360, %f466, %f358, %f354; - fma.rn.ftz.f32 %f361, %f466, %f359, %f355; - neg.ftz.f32 %f144, %f359; - fma.rn.ftz.f32 %f368, %f467, %f144, %f360; - fma.rn.ftz.f32 %f369, %f467, %f358, %f361; - st.global.v2.f32 [%r74], {%f368, %f369}; - add.s32 %r75, %r74, 2048; - add.s32 %r202, %r202, 16; - add.s32 %r201, %r201, -1; - setp.ne.s32 %p11, %r201, 0; - mov.u32 %r204, %r75; - @%p11 bra BB0_19; - - bar.sync 0; - add.s32 %r198, %r198, 1; - add.s32 %r79, %r59, 32768; - setp.ne.s32 %p12, %r198, 2; - mov.u32 %r205, %r79; - @%p12 bra BB0_15; - - ret; -} - - diff --git a/RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl-0.ptx deleted file mode 100644 index 95682a21f13f73a46e51290282c947066bcf9e94..0000000000000000000000000000000000000000 Binary files a/RTCP/GPUProc/src/BeamFormer/CoherentStokes.cl-0.ptx and /dev/null differ diff --git a/RTCP/GPUProc/src/BeamFormer/Dedispersion.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/Dedispersion.cl-0.ptx deleted file mode 100644 index 221a542fb5fdfc1737730694aaf0921d00774dfb..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/BeamFormer/Dedispersion.cl-0.ptx +++ /dev/null @@ -1,417 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Sat Sep 29 10:12:13 2012 (1348906333) -// Driver 304.54 -// - -.version 3.0 -.target sm_30, texmode_independent -.address_size 32 - -.extern .shared .align 4 .b8 shr_2_local_DMs[512]; - -.entry applyChirp( - .param .u32 .ptr .global .align 1 applyChirp_param_0, - .param .u32 .ptr .global .align 4 applyChirp_param_1, - .param .f32 applyChirp_param_2 -) -{ - .reg .f32 %f<225>; - .reg .pred %p<26>; - .reg .s32 %r<97>; - - - // inline asm - mov.u32 %r32, %tid.x; - // inline asm - setp.gt.s32 %p4, %r32, 127; - mov.u32 %r92, %r32; - @%p4 bra BB0_2; - -BB0_1: - shl.b32 %r34, %r92, 2; - ld.param.u32 %r91, [applyChirp_param_1]; - add.s32 %r35, %r91, %r34; - ld.global.f32 %f29, [%r35]; - add.ftz.f32 %f30, %f29, %f29; - mul.ftz.f32 %f31, %f30, 0f40490FDB; - mul.ftz.f32 %f32, %f31, 0f596BD7E5; - mov.u32 %r36, shr_2_local_DMs; - add.s32 %r37, %r36, %r34; - st.shared.f32 [%r37], %f32; - // inline asm - mov.u32 %r33, %ntid.x; - // inline asm - add.s32 %r92, %r33, %r92; - setp.lt.s32 %p5, %r92, 128; - @%p5 bra BB0_1; - -BB0_2: - bar.sync 0; - // inline asm - mov.u32 %r38, %envreg3; - // inline asm - // inline asm - mov.u32 %r39, %ntid.x; - // inline asm - // inline asm - mov.u32 %r40, %ctaid.x; - // inline asm - // inline asm - mov.u32 %r41, %tid.x; - // inline asm - add.s32 %r50, %r41, %r38; - mad.lo.s32 %r51, %r40, %r39, %r50; - // inline asm - mov.u32 %r42, %envreg4; - // inline asm - // inline asm - mov.u32 %r43, %ntid.y; - // inline asm - // inline asm - mov.u32 %r44, %ctaid.y; - // inline asm - // inline asm - mov.u32 %r45, %tid.y; - // inline asm - // inline asm - mov.u32 %r46, %envreg5; - // inline asm - // inline asm - mov.u32 %r47, %ntid.z; - // inline asm - // inline asm - mov.u32 %r48, %ctaid.z; - // inline asm - // inline asm - mov.u32 %r49, %tid.z; - // inline asm - add.s32 %r52, %r49, %r46; - mad.lo.s32 %r53, %r48, %r47, %r52; - cvt.rn.f32.u32 %f35, %r53; - ld.param.f32 %f217, [applyChirp_param_2]; - add.ftz.f32 %f36, %f217, 0fC7BEBC20; - fma.rn.ftz.f32 %f2, %f35, 0f42BEBC20, %f36; - cvt.rn.f32.u32 %f37, %r51; - mul.ftz.f32 %f38, %f37, 0f403EBC20; - setp.gt.u32 %p6, %r51, 32; - fma.rn.ftz.f32 %f39, %f37, 0f403EBC20, 0fC2BEBC20; - selp.f32 %f3, %f39, %f38, %p6; - div.rn.ftz.f32 %f34, %f3, 0f42334A70; - // inline asm - abs.f32 %f33, %f34; - // inline asm - setp.eq.ftz.f32 %p7, %f34, 0f3F800000; - @%p7 bra BB0_25; - - setp.nan.ftz.f32 %p8, %f34, %f34; - @%p8 bra BB0_24; - - mov.f32 %f6, 0fFF800000; - mov.f32 %f220, 0f42A00000; - mov.f32 %f44, 0f3F000000; - mul.rn.f32 %f41, %f44, %f220; - // inline asm - cvt.rmi.f32.f32 %f40, %f41; - // inline asm - mov.f32 %f45, 0f40000000; - mul.rn.f32 %f46, %f45, %f40; - sub.ftz.f32 %f47, %f220, %f46; - setp.eq.ftz.f32 %p1, %f47, 0f3F800000; - // inline asm - cvt.rzi.f32.f32 %f42, %f220; - // inline asm - setp.eq.ftz.f32 %p2, %f42, 0f42A00000; - and.pred %p3, %p1, %p2; - setp.eq.ftz.f32 %p9, %f33, 0f00000000; - @%p9 bra BB0_23; - - setp.eq.ftz.f32 %p10, %f34, 0f7F800000; - setp.eq.ftz.f32 %p11, %f34, %f6; - or.pred %p12, %p10, %p11; - @%p12 bra BB0_20; - - setp.geu.ftz.f32 %p13, %f34, 0f00000000; - @%p13 bra BB0_8; - - mov.f32 %f49, 0f42A00000; - // inline asm - cvt.rzi.f32.f32 %f48, %f49; - // inline asm - setp.neu.ftz.f32 %p14, %f48, 0f42A00000; - @%p14 bra BB0_19; - -BB0_8: - // inline asm - abs.f32 %f50, %f34; - // inline asm - mov.b32 %r18, %f50; - shr.u32 %r54, %r18, 23; - and.b32 %r55, %r54, 255; - add.s32 %r93, %r55, -127; - setp.eq.s32 %p15, %r55, 0; - mov.f32 %f218, %f50; - @%p15 bra BB0_9; - bra.uni BB0_10; - -BB0_9: - and.b32 %r56, %r18, -2139095041; - or.b32 %r57, %r56, 1065353216; - mov.b32 %f52, %r57; - add.ftz.f32 %f53, %f52, 0fBF800000; - mov.b32 %r58, %f53; - shr.u32 %r59, %r58, 23; - and.b32 %r60, %r59, 255; - add.s32 %r93, %r60, -253; - and.b32 %r61, %r58, -2139095041; - or.b32 %r62, %r61, 1065353216; - mov.b32 %f218, %r62; - -BB0_10: - mov.b32 %r63, %f218; - and.b32 %r64, %r63, -2139095041; - or.b32 %r65, %r64, 1065353216; - mov.b32 %f219, %r65; - setp.gt.ftz.f32 %p16, %f219, 0f3FB504F3; - @%p16 bra BB0_11; - bra.uni BB0_12; - -BB0_11: - mul.rn.f32 %f219, %f219, %f44; - add.s32 %r93, %r93, 1; - -BB0_12: - add.ftz.f32 %f63, %f219, 0f3F800000; - rcp.approx.ftz.f32 %f57, %f63; - add.ftz.f32 %f56, %f219, 0fBF800000; - // inline asm - mul.rz.f32 %f55, %f56, %f57; - // inline asm - mul.rn.f32 %f65, %f45, %f55; - mul.rn.f32 %f66, %f65, %f65; - mov.f32 %f67, 0f3B18F0FE; - mul.rn.f32 %f68, %f67, %f66; - add.ftz.f32 %f69, %f68, 0f3C4CAF63; - mul.rn.f32 %f70, %f69, %f66; - add.ftz.f32 %f71, %f70, 0f3DAAAABD; - mul.rn.f32 %f72, %f71, %f66; - mul.rn.f32 %f60, %f72, %f65; - mov.b32 %r66, %f65; - and.b32 %r67, %r66, -4096; - mov.b32 %f73, %r67; - mov.b32 %r68, %f56; - and.b32 %r69, %r68, -4096; - mov.b32 %f74, %r69; - sub.ftz.f32 %f75, %f56, %f73; - mul.rn.f32 %f76, %f45, %f75; - sub.ftz.f32 %f77, %f56, %f74; - mul.rn.f32 %f78, %f73, %f74; - sub.ftz.f32 %f79, %f76, %f78; - mul.rn.f32 %f80, %f73, %f77; - sub.ftz.f32 %f81, %f79, %f80; - mul.rn.f32 %f82, %f57, %f81; - add.ftz.f32 %f83, %f73, %f82; - sub.ftz.f32 %f84, %f83, %f73; - sub.ftz.f32 %f85, %f82, %f84; - add.ftz.f32 %f86, %f83, %f60; - neg.ftz.f32 %f59, %f60; - // inline asm - add.rz.f32 %f58, %f59, %f60; - // inline asm - add.ftz.f32 %f87, %f58, %f85; - add.ftz.f32 %f88, %f86, %f87; - sub.ftz.f32 %f89, %f87, %f87; - cvt.rn.f32.s32 %f90, %r93; - mov.f32 %f91, 0f3F317200; - mul.rn.f32 %f92, %f90, %f91; - mov.f32 %f93, 0f35BFBE8E; - mul.rn.f32 %f94, %f90, %f93; - add.ftz.f32 %f95, %f92, %f88; - sub.ftz.f32 %f96, %f88, %f88; - add.ftz.f32 %f97, %f96, %f89; - add.ftz.f32 %f98, %f97, %f94; - add.ftz.f32 %f13, %f95, %f98; - sub.ftz.f32 %f14, %f98, %f98; - // inline asm - abs.f32 %f61, %f220; - // inline asm - setp.gt.ftz.f32 %p17, %f61, 0f77F684DF; - @%p17 bra BB0_13; - bra.uni BB0_14; - -BB0_13: - mov.f32 %f100, 0f39000000; - mov.f32 %f101, 0f42A00000; - mul.rn.f32 %f220, %f101, %f100; - -BB0_14: - mov.f32 %f102, 0f45800800; - mul.rn.f32 %f103, %f13, %f102; - sub.ftz.f32 %f104, %f13, %f103; - add.ftz.f32 %f105, %f104, %f103; - sub.ftz.f32 %f106, %f13, %f105; - mul.rn.f32 %f107, %f220, %f102; - sub.ftz.f32 %f108, %f220, %f107; - add.ftz.f32 %f109, %f108, %f107; - sub.ftz.f32 %f110, %f220, %f109; - mul.rn.f32 %f111, %f105, %f109; - mul.rn.f32 %f112, %f13, %f220; - sub.ftz.f32 %f113, %f111, %f112; - mul.rn.f32 %f114, %f105, %f110; - add.ftz.f32 %f115, %f113, %f114; - mul.rn.f32 %f116, %f106, %f109; - add.ftz.f32 %f117, %f115, %f116; - mul.rn.f32 %f118, %f106, %f110; - add.ftz.f32 %f119, %f117, %f118; - mul.rn.f32 %f120, %f14, %f220; - add.ftz.f32 %f121, %f120, %f119; - add.ftz.f32 %f122, %f112, %f121; - sub.ftz.f32 %f17, %f121, %f121; - mov.f32 %f223, %f17; - mov.f32 %f224, %f122; - mov.b32 %r24, %f122; - setp.eq.s32 %p18, %r24, 1118925336; - @%p18 bra BB0_15; - bra.uni BB0_16; - -BB0_15: - add.s32 %r70, %r24, -1; - mov.b32 %f123, %r70; - add.ftz.f32 %f124, %f17, 0f37000000; - mov.f32 %f223, %f124; - mov.f32 %f224, %f123; - -BB0_16: - // inline asm - mul.f32 %f125, %f224, 0f3FB8AA3B;ex2.approx.f32 %f125, %f125; - // inline asm - setp.neu.ftz.f32 %p19, %f125, 0f7F800000; - mov.f32 %f221, %f125; - @%p19 bra BB0_17; - bra.uni BB0_18; - -BB0_17: - // inline asm - mad.f32 %f127, %f125, %f223, %f125; - // inline asm - mov.f32 %f221, %f127; - -BB0_18: - not.pred %p21, %p3; - or.pred %p23, %p13, %p21; - mov.b32 %r71, %f221; - xor.b32 %r72, %r71, -2147483648; - mov.b32 %f131, %r72; - selp.f32 %f222, %f221, %f131, %p23; - bra.uni BB0_26; - -BB0_19: - mov.f32 %f222, 0f7FFFFFFF; - bra.uni BB0_26; - -BB0_20: - mov.b32 %r73, %f34; - setp.lt.s32 %p24, %r73, 0; - @%p24 bra BB0_22; - - mov.f32 %f222, 0f7F800000; - bra.uni BB0_26; - -BB0_22: - selp.f32 %f222, 0fFF800000, 0f7F800000, %p3; - bra.uni BB0_26; - -BB0_23: - mov.b32 %r74, %f34; - and.b32 %r75, %r74, -2147483648; - mov.b32 %f132, %r75; - selp.f32 %f222, %f132, 0f00000000, %p3; - bra.uni BB0_26; - -BB0_24: - add.ftz.f32 %f222, %f34, 0f42A00000; - bra.uni BB0_26; - -BB0_25: - mov.f32 %f222, 0f3F800000; - -BB0_26: - add.ftz.f32 %f135, %f222, 0f3F800000; - // inline asm - rsqrt.approx.f32 %f134, %f135; - // inline asm - mul.ftz.f32 %f136, %f134, 0f42000000; - div.approx.ftz.f32 %f137, %f3, %f2; - mul.ftz.f32 %f138, %f137, %f137; - add.ftz.f32 %f139, %f2, %f3; - div.approx.ftz.f32 %f28, %f138, %f139; - add.s32 %r78, %r45, %r42; - mad.lo.s32 %r79, %r44, %r43, %r78; - shl.b32 %r80, %r79, 8; - shl.b32 %r83, %r53, 8; - add.s32 %r84, %r80, %r83; - shl.b32 %r87, %r51, 3; - add.s32 %r88, %r84, %r87; - ld.param.u32 %r90, [applyChirp_param_0]; - add.s32 %r89, %r88, %r90; - add.s32 %r95, %r89, 1048576; - mov.u32 %r96, 128; - mov.u32 %r94, shr_2_local_DMs; - -BB0_27: - ld.global.v2.f32 {%f181, %f182}, [%r95+-1048576]; - ld.global.v2.f32 {%f183, %f184}, [%r95+-524288]; - ld.shared.f32 %f149, [%r94]; - mul.ftz.f32 %f144, %f149, %f28; - // inline asm - cos.approx.f32 %f141, %f144; - // inline asm - // inline asm - sin.approx.f32 %f143, %f144; - // inline asm - mul.ftz.f32 %f187, %f141, %f136; - mul.ftz.f32 %f188, %f143, %f136; - mul.ftz.f32 %f152, %f187, %f181; - neg.f32 %f155, %f188; - fma.rn.ftz.f32 %f156, %f155, %f182, %f152; - mul.ftz.f32 %f157, %f188, %f181; - fma.rn.ftz.f32 %f158, %f187, %f182, %f157; - st.global.v2.f32 [%r95+-1048576], {%f156, %f158}; - mul.ftz.f32 %f160, %f187, %f183; - fma.rn.ftz.f32 %f162, %f155, %f184, %f160; - mul.ftz.f32 %f163, %f188, %f183; - fma.rn.ftz.f32 %f164, %f187, %f184, %f163; - st.global.v2.f32 [%r95+-524288], {%f162, %f164}; - ld.global.v2.f32 {%f195, %f196}, [%r95]; - ld.global.v2.f32 {%f197, %f198}, [%r95+524288]; - ld.shared.f32 %f165, [%r94+4]; - mul.ftz.f32 %f148, %f165, %f28; - // inline asm - cos.approx.f32 %f145, %f148; - // inline asm - // inline asm - sin.approx.f32 %f147, %f148; - // inline asm - mul.ftz.f32 %f201, %f145, %f136; - mul.ftz.f32 %f202, %f147, %f136; - mul.ftz.f32 %f168, %f201, %f195; - neg.f32 %f171, %f202; - fma.rn.ftz.f32 %f172, %f171, %f196, %f168; - mul.ftz.f32 %f173, %f202, %f195; - fma.rn.ftz.f32 %f174, %f201, %f196, %f173; - st.global.v2.f32 [%r95], {%f172, %f174}; - mul.ftz.f32 %f176, %f201, %f197; - fma.rn.ftz.f32 %f178, %f171, %f198, %f176; - mul.ftz.f32 %f179, %f202, %f197; - fma.rn.ftz.f32 %f180, %f201, %f198, %f179; - st.global.v2.f32 [%r95+524288], {%f178, %f180}; - add.s32 %r95, %r95, 2097152; - add.s32 %r94, %r94, 8; - add.s32 %r96, %r96, -2; - setp.ne.s32 %p25, %r96, 0; - @%p25 bra BB0_27; - - ret; -} - - diff --git a/RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl-0.ptx deleted file mode 100644 index be0613c10bbe025940d0054bfd9e8579149984e3..0000000000000000000000000000000000000000 Binary files a/RTCP/GPUProc/src/BeamFormer/IncoherentStokes.cl-0.ptx and /dev/null differ diff --git a/RTCP/GPUProc/src/BeamFormer/IntToFloat.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/IntToFloat.cl-0.ptx deleted file mode 100644 index ce5c60b9ac9b08ef14ba9402e30163b4bede74bb..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/BeamFormer/IntToFloat.cl-0.ptx +++ /dev/null @@ -1,82 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Sat Sep 29 10:12:13 2012 (1348906333) -// Driver 304.54 -// - -.version 3.0 -.target sm_30, texmode_independent -.address_size 32 - - -.entry intToFloat( - .param .u32 .ptr .global .align 1 intToFloat_param_0, - .param .u32 .ptr .global .align 1 intToFloat_param_1 -) -{ - .reg .f32 %f<9>; - .reg .pred %p<3>; - .reg .s32 %r<25>; - .reg .s16 %rc<9>; - - - // inline asm - mov.u32 %r7, %envreg4; - // inline asm - // inline asm - mov.u32 %r8, %ntid.y; - // inline asm - // inline asm - mov.u32 %r9, %ctaid.y; - // inline asm - // inline asm - mov.u32 %r10, %tid.y; - // inline asm - add.s32 %r12, %r10, %r7; - mad.lo.s32 %r3, %r9, %r8, %r12; - // inline asm - mov.u32 %r11, %tid.x; - // inline asm - setp.gt.u32 %p1, %r11, 65535; - mov.u32 %r24, %r11; - @%p1 bra BB0_2; - -BB0_1: - shl.b32 %r14, %r3, 18; - ld.param.u32 %r23, [intToFloat_param_1]; - add.s32 %r15, %r23, %r14; - shl.b32 %r16, %r24, 2; - add.s32 %r17, %r15, %r16; - ld.global.v2.u8 {%rc5, %rc6}, [%r17]; - // inline asm - cvt.rn.f32.s8 %f1, %rc5; - // inline asm - // inline asm - cvt.rn.f32.s8 %f2, %rc6; - // inline asm - shl.b32 %r18, %r3, 20; - ld.param.u32 %r22, [intToFloat_param_0]; - add.s32 %r19, %r22, %r18; - shl.b32 %r20, %r24, 3; - add.s32 %r21, %r19, %r20; - st.global.v2.f32 [%r21], {%f1, %f2}; - ld.global.v2.u8 {%rc7, %rc8}, [%r17+2]; - // inline asm - cvt.rn.f32.s8 %f3, %rc7; - // inline asm - // inline asm - cvt.rn.f32.s8 %f4, %rc8; - // inline asm - st.global.v2.f32 [%r21+524288], {%f3, %f4}; - // inline asm - mov.u32 %r13, %ntid.x; - // inline asm - add.s32 %r24, %r13, %r24; - setp.lt.u32 %p2, %r24, 65536; - @%p2 bra BB0_1; - -BB0_2: - ret; -} - - diff --git a/RTCP/GPUProc/src/BeamFormer/Transpose.cl-0.ptx b/RTCP/GPUProc/src/BeamFormer/Transpose.cl-0.ptx deleted file mode 100644 index 26a5c9cca9b954543c29abfbb6bb29d45835a803..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/BeamFormer/Transpose.cl-0.ptx +++ /dev/null @@ -1,134 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Sat Sep 29 10:12:13 2012 (1348906333) -// Driver 304.54 -// - -.version 3.0 -.target sm_30, texmode_independent -.address_size 32 - -.extern .shared .align 16 .b8 shr_3_tmp[4352]; - -.entry transposeComplexVoltages( - .param .u32 .ptr .global .align 1 transposeComplexVoltages_param_0, - .param .u32 .ptr .global .align 1 transposeComplexVoltages_param_1 -) -{ - .reg .f32 %f<65>; - .reg .pred %p<2>; - .reg .s32 %r<58>; - - - ld.param.u32 %r26, [transposeComplexVoltages_param_0]; - ld.param.u32 %r27, [transposeComplexVoltages_param_1]; - // inline asm - mov.u32 %r13, %envreg4; - // inline asm - // inline asm - mov.u32 %r14, %ntid.y; - // inline asm - // inline asm - mov.u32 %r15, %ctaid.y; - // inline asm - // inline asm - mov.u32 %r16, %tid.y; - // inline asm - // inline asm - mov.u32 %r17, %envreg5; - // inline asm - // inline asm - mov.u32 %r18, %ntid.z; - // inline asm - // inline asm - mov.u32 %r19, %ctaid.z; - // inline asm - // inline asm - mov.u32 %r20, %tid.z; - // inline asm - // inline asm - mov.u32 %r21, %tid.x; - // inline asm - // inline asm - mov.u32 %r22, %tid.x; - // inline asm - shr.u32 %r28, %r22, 4; - // inline asm - mov.u32 %r23, %tid.x; - // inline asm - shr.u32 %r29, %r23, 4; - // inline asm - mov.u32 %r24, %tid.x; - // inline asm - and.b32 %r30, %r21, 15; - mov.u32 %r31, shr_3_tmp; - mad.lo.s32 %r32, %r30, 272, %r31; - and.b32 %r33, %r22, -16; - add.s32 %r1, %r32, %r33; - mad.lo.s32 %r34, %r29, 272, %r31; - and.b32 %r35, %r24, 15; - shl.b32 %r36, %r35, 4; - add.s32 %r2, %r34, %r36; - add.s32 %r37, %r20, %r17; - mad.lo.s32 %r38, %r19, %r18, %r37; - shl.b32 %r39, %r38, 4; - add.s32 %r40, %r28, %r39; - shl.b32 %r41, %r40, 16; - add.s32 %r42, %r16, %r13; - mad.lo.s32 %r43, %r15, %r14, %r42; - shl.b32 %r44, %r43, 4; - add.s32 %r45, %r44, %r30; - shl.b32 %r46, %r45, 4; - add.s32 %r47, %r41, %r46; - add.s32 %r48, %r47, %r27; - add.s32 %r56, %r48, 4096; - add.s32 %r49, %r29, %r44; - shl.b32 %r50, %r49, 20; - add.s32 %r51, %r39, %r35; - shl.b32 %r52, %r51, 3; - add.s32 %r53, %r50, %r52; - add.s32 %r54, %r53, %r26; - add.s32 %r55, %r54, 525056; - mov.u32 %r57, 2048; - -BB0_1: - add.s32 %r8, %r56, -4096; - ld.global.v4.f32 {%f61, %f62, %f63, %f64}, [%r56+-4096]; - st.shared.v4.f32 [%r1], {%f61, %f62, %f63, %f64}; - bar.sync 0; - ld.shared.v4.f32 {%f53, %f54, %f55, %f56}, [%r2]; - add.s32 %r9, %r55, -525056; - st.global.v2.f32 [%r55+-525056], {%f53, %f54}; - st.global.v2.f32 [%r55+-768], {%f55, %f56}; - bar.sync 0; - ld.global.v4.f32 {%f49, %f50, %f51, %f52}, [%r8+2048]; - st.shared.v4.f32 [%r1], {%f49, %f50, %f51, %f52}; - bar.sync 0; - ld.shared.v4.f32 {%f41, %f42, %f43, %f44}, [%r2]; - st.global.v2.f32 [%r9+256], {%f41, %f42}; - st.global.v2.f32 [%r9+524544], {%f43, %f44}; - bar.sync 0; - ld.global.v4.f32 {%f37, %f38, %f39, %f40}, [%r8+4096]; - st.shared.v4.f32 [%r1], {%f37, %f38, %f39, %f40}; - bar.sync 0; - ld.shared.v4.f32 {%f29, %f30, %f31, %f32}, [%r2]; - st.global.v2.f32 [%r9+512], {%f29, %f30}; - st.global.v2.f32 [%r9+524800], {%f31, %f32}; - bar.sync 0; - ld.global.v4.f32 {%f25, %f26, %f27, %f28}, [%r8+6144]; - st.shared.v4.f32 [%r1], {%f25, %f26, %f27, %f28}; - bar.sync 0; - ld.shared.v4.f32 {%f17, %f18, %f19, %f20}, [%r2]; - st.global.v2.f32 [%r9+768], {%f17, %f18}; - st.global.v2.f32 [%r9+525056], {%f19, %f20}; - bar.sync 0; - add.s32 %r56, %r56, 8192; - add.s32 %r55, %r55, 1024; - add.s32 %r57, %r57, -4; - setp.ne.s32 %p1, %r57, 0; - @%p1 bra BB0_1; - - ret; -} - - diff --git a/RTCP/GPUProc/src/CL/cl.hpp b/RTCP/GPUProc/src/CL/cl.hpp index 5111f37dc8c74a540999ff6a06b9ae7ec07b6455..aa2b0bd040fd81ed57f433c242dcc1b39693f35a 100644 --- a/RTCP/GPUProc/src/CL/cl.hpp +++ b/RTCP/GPUProc/src/CL/cl.hpp @@ -1,4 +1,4 @@ -#include <iostream> +//#include <iostream> /******************************************************************************* * Copyright (c) 2008-2011 The Khronos Group Inc. * @@ -2069,9 +2069,9 @@ public: { cl_int error; object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); -if (!(flags & CL_MEM_ALLOC_HOST_PTR)) -#pragma omp critical (cout) -std::cout << "A: Allocating " << size << " bytes" << std::endl; +//if (!(flags & CL_MEM_ALLOC_HOST_PTR)) +//#pragma omp critical (cout) +//std::cout << "A: Allocating " << size << " bytes" << std::endl; detail::errHandler(error, __CREATE_BUFFER_ERR); if (err != NULL) { @@ -2090,9 +2090,9 @@ std::cout << "A: Allocating " << size << " bytes" << std::endl; Context context = Context::getDefault(err); object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); -if (!(flags & CL_MEM_ALLOC_HOST_PTR)) -#pragma omp critical (cout) -std::cout << "B: Allocating " << size << " bytes" << std::endl; +//if (!(flags & CL_MEM_ALLOC_HOST_PTR)) +//#pragma omp critical (cout) +//std::cout << "B: Allocating " << size << " bytes" << std::endl; detail::errHandler(error, __CREATE_BUFFER_ERR); if (err != NULL) { diff --git a/RTCP/GPUProc/src/Correlator.cl b/RTCP/GPUProc/src/Correlator.cl index 9ca1ce91620cc5927211907be030ff33626164ce..5d41253f40d8ca7cabe4bc7a640ab1309ea913fd 100644 --- a/RTCP/GPUProc/src/Correlator.cl +++ b/RTCP/GPUProc/src/Correlator.cl @@ -1,7 +1,7 @@ #define NR_BASELINES (NR_STATIONS * (NR_STATIONS + 1) / 2) #if NR_STATIONS == 288 -#define BLOCK_SIZE 8 +#define BLOCK_SIZE 6 #elif defined NVIDIA_CUDA && NR_SAMPLES_PER_CHANNEL % 24 == 0 #define BLOCK_SIZE 24 #else @@ -24,7 +24,7 @@ __kernel void correlate(__global void *visibilitiesPtr, __local float samples[4][BLOCK_SIZE][NR_STATIONS | 1]; // avoid power-of-2 uint baseline = get_global_id(0); - uint channel = get_global_id(1); + uint channel = get_global_id(1) + 1; uint stat_0 = convert_uint_rtz(sqrt(convert_float(8 * baseline + 1)) - 0.99999f) / 2; uint stat_A = baseline - stat_0 * (stat_0 + 1) / 2; @@ -84,7 +84,7 @@ __kernel void correlate_2x2(__global void *visibilitiesPtr, __local float4 samples[2][BLOCK_SIZE][(NR_STATIONS + 1) / 2 | 1]; // avoid power-of-2 - uint channel = get_global_id(1); + uint channel = get_global_id(1) + 1; uint block = get_global_id(0); uint x = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2; @@ -181,7 +181,7 @@ __kernel void correlate_3x3(__global void *visibilitiesPtr, __local float4 samples[3][BLOCK_SIZE][(NR_STATIONS + 2) / 3 | 1]; // avoid power-of-2 - uint channel = get_global_id(1); + uint channel = get_global_id(1) + 1; uint block = get_global_id(0); uint x = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2; @@ -338,7 +338,7 @@ __kernel void correlate_4x4(__global void *visibilitiesPtr, __local float4 samples[4][BLOCK_SIZE][(NR_STATIONS + 3) / 4 | 1]; // avoid power-of-2 - uint channel = get_global_id(1); + uint channel = get_global_id(1) + 1; uint block = get_global_id(0); uint x = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2; diff --git a/RTCP/GPUProc/src/Correlator.cl-0.ptx b/RTCP/GPUProc/src/Correlator.cl-0.ptx deleted file mode 100644 index 37d40c5127c10f59970bebd14f4800ba1f03e8a1..0000000000000000000000000000000000000000 Binary files a/RTCP/GPUProc/src/Correlator.cl-0.ptx and /dev/null differ diff --git a/RTCP/GPUProc/src/DelayAndBandPass.cl-0.ptx b/RTCP/GPUProc/src/DelayAndBandPass.cl-0.ptx deleted file mode 100644 index 765b24b939aaab3b67274c4aedf221c92af81818..0000000000000000000000000000000000000000 Binary files a/RTCP/GPUProc/src/DelayAndBandPass.cl-0.ptx and /dev/null differ diff --git a/RTCP/GPUProc/src/FIR.cl-0.ptx b/RTCP/GPUProc/src/FIR.cl-0.ptx deleted file mode 100644 index d1a6fd1785efd5436cb75094fa168a1692b1643d..0000000000000000000000000000000000000000 Binary files a/RTCP/GPUProc/src/FIR.cl-0.ptx and /dev/null differ diff --git a/RTCP/GPUProc/src/FilterBank.cc b/RTCP/GPUProc/src/FilterBank.cc index beacf7cbcdf86a2243fda08c43abef4096a3b253..b33756a68ba03a56684f7a1d9025a730374293c8 100644 --- a/RTCP/GPUProc/src/FilterBank.cc +++ b/RTCP/GPUProc/src/FilterBank.cc @@ -174,152 +174,155 @@ void FilterBank::interpolate(const double x[], const double y[], unsigned xlen, // but has better stopband characteristics. void FilterBank::generate_fir_filter(unsigned n, double w, const double window[], double result[]) { - // make sure grid is big enough for the window - // the grid must be at least (n+1)/2 - // for all filters where the order is a power of two minus 1, grid_n = n+1; - unsigned grid_n = nextPowerOfTwo(n + 1); +#pragma omp critical (FFTW) + { + // make sure grid is big enough for the window + // the grid must be at least (n+1)/2 + // for all filters where the order is a power of two minus 1, grid_n = n+1; + unsigned grid_n = nextPowerOfTwo(n + 1); - unsigned ramp_n = 2; // grid_n/20; + unsigned ramp_n = 2; // grid_n/20; - // Apply ramps to discontinuities - // this is a low pass filter - // maybe we can omit the "w, 0" point? - // I did observe a small difference - double f[] = { 0.0, w - ramp_n / grid_n / 2.0, w, w + ramp_n / grid_n / 2.0, 1.0 }; - double m[] = { 1.0, 1.0, 0.0, 0.0, 0.0 }; + // Apply ramps to discontinuities + // this is a low pass filter + // maybe we can omit the "w, 0" point? + // I did observe a small difference + double f[] = { 0.0, w - ramp_n / grid_n / 2.0, w, w + ramp_n / grid_n / 2.0, 1.0 }; + double m[] = { 1.0, 1.0, 0.0, 0.0, 0.0 }; - // grid is a 1-D array with grid_n+1 points. Values are 1 in filter passband, 0 otherwise - std::vector<double> grid(grid_n + 1); + // grid is a 1-D array with grid_n+1 points. Values are 1 in filter passband, 0 otherwise + std::vector<double> grid(grid_n + 1); - // interpolate between grid points - interpolate(f, m, 5 /* length of f and m arrays */, grid_n + 1, &grid[0]); + // interpolate between grid points + interpolate(f, m, 5 /* length of f and m arrays */, grid_n + 1, &grid[0]); #if 0 - std::stringstream logStr; - logStr << "interpolated = ["; - for(unsigned i=0; i<grid_n+1; i++) { - logStr << grid[i]; - if(i != grid_n+1-1) logStr << ", "; - } - logStr << "];"; - LOG_DEBUG(logStr.str()); + std::stringstream logStr; + logStr << "interpolated = ["; + for(unsigned i=0; i<grid_n+1; i++) { + logStr << grid[i]; + if(i != grid_n+1-1) logStr << ", "; + } + logStr << "];"; + LOG_DEBUG(logStr.str()); #endif - // the grid we do an ifft on is: - // grid appended with grid_n*2 zeros - // appended with original grid values from indices grid_n..2, i.e., the values in reverse order - // (note, arrays start at 1 in octave!) - // the input for the ifft is of size 4*grid_n - // input = [grid ; zeros(grid_n*2,1) ;grid(grid_n:-1:2)]; + // the grid we do an ifft on is: + // grid appended with grid_n*2 zeros + // appended with original grid values from indices grid_n..2, i.e., the values in reverse order + // (note, arrays start at 1 in octave!) + // the input for the ifft is of size 4*grid_n + // input = [grid ; zeros(grid_n*2,1) ;grid(grid_n:-1:2)]; #if defined HAVE_FFTW3 - fftwf_complex* cinput = (fftwf_complex*) fftwf_malloc(grid_n * 4 * sizeof(fftwf_complex)); - fftwf_complex* coutput = (fftwf_complex*) fftwf_malloc(grid_n * 4 * sizeof(fftwf_complex)); + fftwf_complex* cinput = (fftwf_complex*) fftwf_malloc(grid_n * 4 * sizeof(fftwf_complex)); + fftwf_complex* coutput = (fftwf_complex*) fftwf_malloc(grid_n * 4 * sizeof(fftwf_complex)); #elif defined HAVE_FFTW2 - fftw_complex* cinput = (fftw_complex*) fftw_malloc(grid_n*4*sizeof(fftw_complex)); - fftw_complex* coutput = (fftw_complex*) fftw_malloc(grid_n*4*sizeof(fftw_complex)); + fftw_complex* cinput = (fftw_complex*) fftw_malloc(grid_n*4*sizeof(fftw_complex)); + fftw_complex* coutput = (fftw_complex*) fftw_malloc(grid_n*4*sizeof(fftw_complex)); #endif - if (cinput == NULL || coutput == NULL) { - THROW(GPUProcException, "cannot allocate buffers"); - } + if (cinput == NULL || coutput == NULL) { + THROW(GPUProcException, "cannot allocate buffers"); + } - // wipe imaginary part - for (unsigned i = 0; i < grid_n * 4; i++) { - fftw_imag(cinput[i]) = 0.0; - } + // wipe imaginary part + for (unsigned i = 0; i < grid_n * 4; i++) { + fftw_imag(cinput[i]) = 0.0; + } - // copy first part of grid - for (unsigned i = 0; i < grid_n + 1; i++) { - fftw_real(cinput[i]) = grid[i]; - } + // copy first part of grid + for (unsigned i = 0; i < grid_n + 1; i++) { + fftw_real(cinput[i]) = grid[i]; + } - // append zeros - for (unsigned i = grid_n + 1; i <= grid_n * 3; i++) { - fftw_real(cinput[i]) = 0.0; - } + // append zeros + for (unsigned i = grid_n + 1; i <= grid_n * 3; i++) { + fftw_real(cinput[i]) = 0.0; + } - // now append the grid in reverse order - for (unsigned i = grid_n - 1, index = 0; i >= 1; i --, index ++) { - fftw_real(cinput[grid_n * 3 + 1 + index]) = grid[i]; - } + // now append the grid in reverse order + for (unsigned i = grid_n - 1, index = 0; i >= 1; i --, index ++) { + fftw_real(cinput[grid_n * 3 + 1 + index]) = grid[i]; + } #if 0 - std::stringstream logStr; - logStr << "ifft_in = ["; - for(unsigned i=0; i<grid_n*4; i++) { - logStr << fftw_real(cinput[i]) << " " << fftw_imag(cinput[i]); - if(i != grid_n*4-1) logStr << ", "; - } - logStr << "];"; - LOG_DEBUG(logStr.str()); + std::stringstream logStr; + logStr << "ifft_in = ["; + for(unsigned i=0; i<grid_n*4; i++) { + logStr << fftw_real(cinput[i]) << " " << fftw_imag(cinput[i]); + if(i != grid_n*4-1) logStr << ", "; + } + logStr << "];"; + LOG_DEBUG(logStr.str()); #endif #if defined HAVE_FFTW3 - fftwf_plan plan = fftwf_plan_dft_1d(grid_n * 4, cinput, coutput, FFTW_BACKWARD, FFTW_ESTIMATE); - fftwf_execute(plan); + fftwf_plan plan = fftwf_plan_dft_1d(grid_n * 4, cinput, coutput, FFTW_BACKWARD, FFTW_ESTIMATE); + fftwf_execute(plan); #elif defined HAVE_FFTW2 - fftw_plan plan = fftw_create_plan(grid_n * 4, FFTW_BACKWARD, FFTW_ESTIMATE); - fftw_one(plan, cinput, coutput); + fftw_plan plan = fftw_create_plan(grid_n * 4, FFTW_BACKWARD, FFTW_ESTIMATE); + fftw_one(plan, cinput, coutput); #endif #if 0 - for(unsigned i=0; i<grid_n*4; i++) { - LOG_DEBUG_STR("ifft result [" << i << "] = " << fftw_real(coutput[i]) << " " << fftw_imag(coutput[i])); - } + for(unsigned i=0; i<grid_n*4; i++) { + LOG_DEBUG_STR("ifft result [" << i << "] = " << fftw_real(coutput[i]) << " " << fftw_imag(coutput[i])); + } #endif - // half end - // 1 2 n+1 2(n+1) 3(n+1) 4(n+1) - // x x x x x x x x x # last quarter - // x x x x x x # first quarter + // half end + // 1 2 n+1 2(n+1) 3(n+1) 4(n+1) + // x x x x x x x x x # last quarter + // x x x x x x # first quarter - // last_quarter = b([end-n+1:2:end]); # the size is only 1/8, since we skip half of the elements - // first_quarter = b(2:2:(n+1)); # the size is only 1/8, since we skip half of the elements + // last_quarter = b([end-n+1:2:end]); # the size is only 1/8, since we skip half of the elements + // first_quarter = b(2:2:(n+1)); # the size is only 1/8, since we skip half of the elements - unsigned index = 0; + unsigned index = 0; - for (unsigned i = 4 * grid_n - n; i < 4 * grid_n; i += 2) { - result[index] = fftw_real(coutput[i]); - index++; - } + for (unsigned i = 4 * grid_n - n; i < 4 * grid_n; i += 2) { + result[index] = fftw_real(coutput[i]); + index++; + } - for (unsigned i = 1; i <= n; i += 2) { - result[index] = fftw_real(coutput[i]); - index++; - } + for (unsigned i = 1; i <= n; i += 2) { + result[index] = fftw_real(coutput[i]); + index++; + } #if defined HAVE_FFTW3 - fftwf_destroy_plan(plan); - fftwf_free(cinput); - fftwf_free(coutput); + fftwf_destroy_plan(plan); + fftwf_free(cinput); + fftwf_free(coutput); #elif defined HAVE_FFTW2 - fftw_destroy_plan(plan); - fftw_free(cinput); - fftw_free(coutput); + fftw_destroy_plan(plan); + fftw_free(cinput); + fftw_free(coutput); #endif - // multiply with window - for (unsigned i = 0; i <= n; i++) { - result[i] *= window[i]; - } + // multiply with window + for (unsigned i = 0; i <= n; i++) { + result[i] *= window[i]; + } - // normalize - double factor = result[n / 2]; - for (unsigned i = 0; i <= n; i++) { - result[i] /= factor; - } + // normalize + double factor = result[n / 2]; + for (unsigned i = 0; i <= n; i++) { + result[i] /= factor; + } #if 0 - std::stringstream logStr; - logStr << "result = ["; - for(unsigned i=0; i<=n; i++) { - logStr << result[i]; - if(i != n) logStr << ", "; - } - logStr << "];"; - LOG_DEBUG(logStr.str()); + std::stringstream logStr; + logStr << "result = ["; + for(unsigned i=0; i<=n; i++) { + logStr << result[i]; + if(i != n) logStr << ", "; + } + logStr << "];"; + LOG_DEBUG(logStr.str()); #endif + } } diff --git a/RTCP/GPUProc/src/NewCorrelator.cl b/RTCP/GPUProc/src/NewCorrelator.cl index 307c70f347743ea6201b6fd2a87fad59ad62812d..6d4e6ee4bc3782018a89d6287278253fc1a59c1e 100644 --- a/RTCP/GPUProc/src/NewCorrelator.cl +++ b/RTCP/GPUProc/src/NewCorrelator.cl @@ -1,72 +1,124 @@ #define NR_STATIONS_PER_BLOCK 32 -#define BLOCK_SIZE 8 +#define NR_TIMES_PER_BLOCK 8 #define NR_BASELINES (NR_STATIONS * (NR_STATIONS + 1) / 2) -typedef __global float (*CorrectedDataType)[NR_STATIONS][NR_CHANNELS][NR_SAMPLES_PER_CHANNEL][NR_POLARIZATIONS * 2]; +typedef __global float4 (*CorrectedDataType)[NR_STATIONS][NR_CHANNELS][NR_SAMPLES_PER_CHANNEL]; typedef __global float8 (*VisibilitiesType)[NR_BASELINES][NR_CHANNELS]; -#if 0 -__kernel void correlateTriangles(__global void *visibilitiesPtr, - __global const void *correctedDataPtr -) +__kernel +void correlateTriangleKernel(__global void *visibilitiesPtr, + __global const void *correctedDataPtr) { VisibilitiesType visibilities = (VisibilitiesType) visibilitiesPtr; CorrectedDataType correctedData = (CorrectedDataType) correctedDataPtr; - __local float4 samples[BLOCK_SIZE][NR_STATIONS_PER_BLOCK]; + __local float4 samples[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1]; + uint channel = get_global_id(2) + 1; + uint block = get_global_id(1); + +#if NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 + uint firstStation = block * NR_STATIONS_PER_BLOCK; + uint nrStationsThisBlock = NR_STATIONS_PER_BLOCK; +#else + uint lastStation = block * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK; + uint firstStation = block == 0 ? 0 : lastStation - NR_STATIONS_PER_BLOCK; + uint nrStationsThisBlock = lastStation - firstStation; +#endif - uint triangle = get_global_id(1); - uint channel = get_global_id(2); - uint firstStation = triangle * NR_STATIONS_PER_BLOCK; + uint miniBlock = get_local_id(0); + uint statXoffset = convert_uint_rtz(sqrt(convert_float(8 * miniBlock + 1)) - 0.99999f) / 2; + uint statYoffset = miniBlock - statXoffset * (statXoffset + 1) / 2; - float4 vis_0A_r = (float4) 0, vis_0A_i = (float4) 0; - float4 vis_0B_r = (float4) 0, vis_0B_i = (float4) 0; - float4 vis_1A_r = (float4) 0, vis_1A_i = (float4) 0; - float4 vis_1B_r = (float4) 0, vis_1B_i = (float4) 0; + statXoffset *= 2, statYoffset *= 2; - for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += BLOCK_SIZE) { + float4 vis_0A_r = 0, vis_0A_i = 0; + float4 vis_0B_r = 0, vis_0B_i = 0; + float4 vis_1A_r = 0, vis_1A_i = 0; + float4 vis_1B_r = 0, vis_1B_i = 0; + + bool doCorrelate = statXoffset < nrStationsThisBlock; + + for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) { // load data into local memory -#pragma unroll 1 - for (uint i = get_local_id(0); i < BLOCK_SIZE * NR_STATIONS_PER_BLOCK; i += get_local_size(0)) { - uint time = i % BLOCK_SIZE; - uint stat = i / BLOCK_SIZE; + barrier(CLK_LOCAL_MEM_FENCE); - if (firstStation + stat < NR_STATIONS) - samples[time][stat] = (*correctedData)[firstStation + stat][channel][major + time]; + for (uint i = get_local_id(0); i < nrStationsThisBlock * NR_TIMES_PER_BLOCK; i += get_local_size(0)) { + uint time = i % NR_TIMES_PER_BLOCK; + uint stat = i / NR_TIMES_PER_BLOCK; + + samples[stat % 2][time][stat / 2] = (*correctedData)[firstStation + stat][channel][major + time]; } barrier(CLK_LOCAL_MEM_FENCE); - // compute auto-correlations +#pragma unroll 1 + for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) { + float4 sample_0, sample_1, sample_A, sample_B; + + if (doCorrelate) { + sample_0 = samples[0][time][statYoffset / 2]; + sample_A = samples[0][time][statXoffset / 2]; + sample_B = samples[1][time][statXoffset / 2]; + sample_1 = samples[1][time][statYoffset / 2]; + + vis_0A_r += sample_0.xxzz * sample_A.xzxz; + vis_0A_i += sample_0.yyww * sample_A.xzxz; + vis_0B_r += sample_0.xxzz * sample_B.xzxz; + vis_0B_i += sample_0.yyww * sample_B.xzxz; + vis_1A_r += sample_1.xxzz * sample_A.xzxz; + vis_1A_i += sample_1.yyww * sample_A.xzxz; + vis_1B_r += sample_1.xxzz * sample_B.xzxz; + vis_1B_i += sample_1.yyww * sample_B.xzxz; - if (firstStation + get_local_id(0) < NR_STATIONS) { - for (time = 0; time + BLOCK_SIZE; time ++) { - float sample = samples[time][get_local_id(0)]; + vis_0A_r += sample_0.yyww * sample_A.ywyw; + vis_0A_i -= sample_0.xxzz * sample_A.ywyw; + vis_0B_r += sample_0.yyww * sample_B.ywyw; + vis_0B_i -= sample_0.xxzz * sample_B.ywyw; + vis_1A_r += sample_1.yyww * sample_A.ywyw; + vis_1A_i -= sample_1.xxzz * sample_A.ywyw; + vis_1B_r += sample_1.yyww * sample_B.ywyw; + vis_1B_i -= sample_1.xxzz * sample_B.ywyw; } + } + } + int statY = firstStation + statYoffset; + uint statX = firstStation + statXoffset; + uint baseline = (statX * (statX + 1) / 2) + statY; - barrier(CLK_LOCAL_MEM_FENCE); + if (statXoffset < nrStationsThisBlock) { + (*visibilities)[baseline ][channel].even = vis_0A_r; + (*visibilities)[baseline ][channel].odd = vis_0A_i; + } + + if (statXoffset < nrStationsThisBlock && statYoffset + 1 < nrStationsThisBlock) { + (*visibilities)[baseline + 1][channel].even = vis_1A_r; + (*visibilities)[baseline + 1][channel].odd = vis_1A_i; + } + + if (statXoffset + 1 < nrStationsThisBlock) { + (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r; + (*visibilities)[baseline + statX + 1][channel].odd = vis_0B_i; + (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r; + (*visibilities)[baseline + statX + 2][channel].odd = vis_1B_i; } } -#endif __kernel __attribute__((reqd_work_group_size(NR_STATIONS_PER_BLOCK * NR_STATIONS_PER_BLOCK / 4, 1, 1))) -void correlateRectangles(__global void *visibilitiesPtr, - __global const void *correctedDataPtr -) +void correlateRectangleKernel(__global void *visibilitiesPtr, + __global const void *correctedDataPtr) { - VisibilitiesType visibilities = (VisibilitiesType) visibilitiesPtr; + VisibilitiesType visibilities = (VisibilitiesType) visibilitiesPtr; CorrectedDataType correctedData = (CorrectedDataType) correctedDataPtr; - __local float4 samplesX[2][BLOCK_SIZE][NR_STATIONS_PER_BLOCK / 2 | 1]; - __local float4 samplesY[2][BLOCK_SIZE][NR_STATIONS_PER_BLOCK / 2 | 1]; + __local float4 samplesX[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1]; + __local float4 samplesY[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1]; uint block = get_global_id(1); - uint channel = get_global_id(2); uint blockX = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2; uint blockY = block - blockX * (blockX + 1) / 2; @@ -78,35 +130,291 @@ void correlateRectangles(__global void *visibilitiesPtr, int firstStationY = (blockY - 1) * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK; #endif - uint statXoffset = (get_local_id(0) / (NR_STATIONS_PER_BLOCK / 2)); - uint statYoffset = (get_local_id(0) % (NR_STATIONS_PER_BLOCK / 2)); + uint statXoffset = get_local_id(0) / (NR_STATIONS_PER_BLOCK / 2); + uint statYoffset = get_local_id(0) % (NR_STATIONS_PER_BLOCK / 2); + + float4 vis_0A_r = 0, vis_0A_i = 0; + float4 vis_0B_r = 0, vis_0B_i = 0; + float4 vis_1A_r = 0, vis_1A_i = 0; + float4 vis_1B_r = 0, vis_1B_i = 0; + + uint loadTime = get_local_id(0) % NR_TIMES_PER_BLOCK; + uint loadStat = get_local_id(0) / NR_TIMES_PER_BLOCK; + + bool doCorrelateLower = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= 0; + bool doCorrelateUpper = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= -1; + bool doLoadY = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + loadStat) >= 0; + + uint channel = get_global_id(2) + 1; + + for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) { + // load data into local memory + float4 sampleX = (*correctedData)[firstStationX + loadStat][channel][major + loadTime]; + float4 sampleY; + + if (doLoadY) + sampleY = (*correctedData)[firstStationY + loadStat][channel][major + loadTime]; + + barrier(CLK_LOCAL_MEM_FENCE); + + samplesX[loadStat % 2][loadTime][loadStat / 2] = sampleX; + + if (doLoadY) + samplesY[loadStat % 2][loadTime][loadStat / 2] = sampleY; + + barrier(CLK_LOCAL_MEM_FENCE); + +#pragma unroll 1 + for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) { + float4 sample_0, sample_1, sample_A, sample_B; + + if (doCorrelateLower) { + sample_0 = samplesY[0][time][statYoffset]; + } + + if (doCorrelateUpper) { + sample_A = samplesX[0][time][statXoffset]; + sample_B = samplesX[1][time][statXoffset]; + sample_1 = samplesY[1][time][statYoffset]; + } + + if (doCorrelateLower) { + vis_0A_r += sample_0.xxzz * sample_A.xzxz; + vis_0A_i += sample_0.yyww * sample_A.xzxz; + vis_0B_r += sample_0.xxzz * sample_B.xzxz; + vis_0B_i += sample_0.yyww * sample_B.xzxz; + vis_0A_r += sample_0.yyww * sample_A.ywyw; + vis_0A_i -= sample_0.xxzz * sample_A.ywyw; + vis_0B_r += sample_0.yyww * sample_B.ywyw; + vis_0B_i -= sample_0.xxzz * sample_B.ywyw; + } + + if (doCorrelateUpper) { + vis_1A_r += sample_1.xxzz * sample_A.xzxz; + vis_1A_i += sample_1.yyww * sample_A.xzxz; + vis_1B_r += sample_1.xxzz * sample_B.xzxz; + vis_1B_i += sample_1.yyww * sample_B.xzxz; + vis_1A_r += sample_1.yyww * sample_A.ywyw; + vis_1A_i -= sample_1.xxzz * sample_A.ywyw; + vis_1B_r += sample_1.yyww * sample_B.ywyw; + vis_1B_i -= sample_1.xxzz * sample_B.ywyw; + } + } + } + + int statY = firstStationY + 2 * statYoffset; + uint statX = firstStationX + 2 * statXoffset; + uint baseline = (statX * (statX + 1) / 2) + statY; + + if (doCorrelateLower) { + (*visibilities)[baseline ][channel].even = vis_0A_r; + (*visibilities)[baseline ][channel].odd = vis_0A_i; + (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r; + (*visibilities)[baseline + statX + 1][channel].odd = vis_0B_i; + } + + if (doCorrelateUpper) { + (*visibilities)[baseline + 1][channel].even = vis_1A_r; + (*visibilities)[baseline + 1][channel].odd = vis_1A_i; + (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r; + (*visibilities)[baseline + statX + 2][channel].odd = vis_1B_i; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +void correlateTriangle(VisibilitiesType visibilities, + CorrectedDataType correctedData, + __local float4 samples[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1], + uint block) +{ + uint channel = get_global_id(2) + 1; + +#if NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 + uint firstStation = block * NR_STATIONS_PER_BLOCK; +#else + int firstStation = (block - 1) * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK; +#endif + + bool doCorrelate = false, doAutoCorrelate = false, doNearAutoCorrelate = false; + uint statXoffset, statYoffset; + + if (get_local_id(0) < 128) { + uint miniBlock = get_local_id(0); + uint miniBlockX = convert_uint_rtz(sqrt(convert_float(8 * miniBlock + 1)) - 0.99999f) / 2; + uint miniBlockY = miniBlock - miniBlockX * (miniBlockX + 1) / 2; + + statXoffset = 2 * miniBlockX + 2; + statYoffset = 2 * miniBlockY; + doCorrelate = statXoffset < NR_STATIONS_PER_BLOCK && (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStation + statYoffset) >= 0); + } else if (get_local_id(0) < 128 + NR_STATIONS_PER_BLOCK / 2) { + statXoffset = statYoffset = 2 * (get_local_id(0) - 128); + // actually, it is the visibility one right of statXoffset + doNearAutoCorrelate = (int) (firstStation + statXoffset) >= 0; + } else if (get_local_id(0) >= 192 && get_local_id(0) < 192 + NR_STATIONS_PER_BLOCK) { + statXoffset = statYoffset = get_local_id(0) - 192; + doAutoCorrelate = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStation + statYoffset) >= 0; + } float4 vis_0A_r = 0, vis_0A_i = 0; float4 vis_0B_r = 0, vis_0B_i = 0; float4 vis_1A_r = 0, vis_1A_i = 0; float4 vis_1B_r = 0, vis_1B_i = 0; - for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += BLOCK_SIZE) { + uint loadTime = get_local_id(0) % NR_TIMES_PER_BLOCK; + uint loadStat = get_local_id(0) / NR_TIMES_PER_BLOCK; + + bool doLoad = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStation + loadStat) >= 0; + + for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) { // load data into local memory - for (uint i = get_local_id(0); i < 4 * BLOCK_SIZE * NR_STATIONS_PER_BLOCK; i += NR_STATIONS_PER_BLOCK * NR_STATIONS_PER_BLOCK / 4) { - uint p = i % 4; - uint time = i / 4 % BLOCK_SIZE; - uint stat = i / 4 / BLOCK_SIZE; + float4 sample; + + if (doLoad) + sample = (*correctedData)[firstStation + loadStat][channel][major + loadTime]; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (doLoad) + samples[loadStat % 2][loadTime][loadStat / 2] = sample; + + barrier(CLK_LOCAL_MEM_FENCE); + +#pragma unroll 1 + for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) { + if (doCorrelate) { + float4 sample_0 = samples[0][time][statYoffset / 2]; + float4 sample_A = samples[0][time][statXoffset / 2]; + float4 sample_B = samples[1][time][statXoffset / 2]; + float4 sample_1 = samples[1][time][statYoffset / 2]; + + vis_0A_r += sample_0.xxzz * sample_A.xzxz; + vis_0A_i += sample_0.yyww * sample_A.xzxz; + vis_0B_r += sample_0.xxzz * sample_B.xzxz; + vis_0B_i += sample_0.yyww * sample_B.xzxz; + vis_0A_r += sample_0.yyww * sample_A.ywyw; + vis_0A_i -= sample_0.xxzz * sample_A.ywyw; + vis_0B_r += sample_0.yyww * sample_B.ywyw; + vis_0B_i -= sample_0.xxzz * sample_B.ywyw; + + vis_1A_r += sample_1.xxzz * sample_A.xzxz; + vis_1A_i += sample_1.yyww * sample_A.xzxz; + vis_1B_r += sample_1.xxzz * sample_B.xzxz; + vis_1B_i += sample_1.yyww * sample_B.xzxz; + vis_1A_r += sample_1.yyww * sample_A.ywyw; + vis_1A_i -= sample_1.xxzz * sample_A.ywyw; + vis_1B_r += sample_1.yyww * sample_B.ywyw; + vis_1B_i -= sample_1.xxzz * sample_B.ywyw; + } - ((__local float *) &samplesX[stat % 2][time][stat / 2])[p] = (*correctedData)[firstStationX + stat][channel][major + time][p]; + if (doAutoCorrelate) { + float4 sample_0 = samples[statYoffset % 2][time][statYoffset / 2]; + vis_0A_r.xyw += sample_0.xxz * sample_0.xzz; + vis_0A_i.y += sample_0.y * sample_0.z; + vis_0A_r.xyw += sample_0.yyw * sample_0.yww; + vis_0A_i.y -= sample_0.x * sample_0.w; + } - if (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + stat) >= 0) - ((__local float *) &samplesY[stat % 2][time][stat / 2])[p] = (*correctedData)[firstStationY + stat][channel][major + time][p]; + if (doNearAutoCorrelate) { + float4 sample_0 = samples[0][time][statYoffset / 2]; + float4 sample_B = samples[1][time][statXoffset / 2]; + vis_0B_r += sample_0.xxzz * sample_B.xzxz; + vis_0B_i += sample_0.yyww * sample_B.xzxz; + vis_0B_r += sample_0.yyww * sample_B.ywyw; + vis_0B_i -= sample_0.xxzz * sample_B.ywyw; + } } + } + + if (doAutoCorrelate) { + vis_0A_r.z = vis_0A_r.y; + vis_0A_i.z = -vis_0A_i.y; + } + + int statY = firstStation + statYoffset; + uint statX = firstStation + statXoffset; + uint baseline = (statX * (statX + 1) / 2) + statY; + + if (doCorrelate || doAutoCorrelate) { + (*visibilities)[baseline ][channel].even = vis_0A_r; + (*visibilities)[baseline ][channel].odd = vis_0A_i; + } + + if (doCorrelate || doNearAutoCorrelate) { + (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r; + (*visibilities)[baseline + statX + 1][channel].odd = vis_0B_i; + } + + if (doCorrelate) { + (*visibilities)[baseline + 1][channel].even = vis_1A_r; + (*visibilities)[baseline + 1][channel].odd = vis_1A_i; + (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r; + (*visibilities)[baseline + statX + 2][channel].odd = vis_1B_i; + } +} + + +void correlateTriangle2(VisibilitiesType visibilities, + CorrectedDataType correctedData, + __local float4 samples[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1], + uint block +) +{ + uint channel = get_global_id(2) + 1; + +#if NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 + uint firstStation = block * NR_STATIONS_PER_BLOCK; + //uint lastStation = firstStation + NR_STATIONS_PER_BLOCK; + uint nrStationsThisBlock = NR_STATIONS_PER_BLOCK; +#else + uint lastStation = block * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK; + uint firstStation = block == 0 ? 0 : lastStation - NR_STATIONS_PER_BLOCK; + uint nrStationsThisBlock = lastStation - firstStation; +#endif + + uint miniBlock = get_local_id(0); + uint statXoffset = convert_uint_rtz(sqrt(convert_float(8 * miniBlock + 1)) - 0.99999f) / 2; + uint statYoffset = miniBlock - statXoffset * (statXoffset + 1) / 2; + + statXoffset *= 2, statYoffset *= 2; + + //bool doCorrelate = statXoffset < nrStationsThisBlock; + + float4 vis_0A_r = 0, vis_0A_i = 0; + float4 vis_0B_r = 0, vis_0B_i = 0; + float4 vis_1A_r = 0, vis_1A_i = 0; + float4 vis_1B_r = 0, vis_1B_i = 0; + + uint loadTime = get_local_id(0) % NR_TIMES_PER_BLOCK; + uint loadStat = get_local_id(0) / NR_TIMES_PER_BLOCK; + + bool doCorrelateLeft = statXoffset < nrStationsThisBlock; + //bool doCorrelateRight = statXoffset + 1 < nrStationsThisBlock; + bool doLoad = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || loadStat < nrStationsThisBlock; + + for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) { + // load data into local memory + float4 sample; + + if (doLoad) + sample = (*correctedData)[firstStation + loadStat][channel][major + loadTime]; barrier(CLK_LOCAL_MEM_FENCE); - if (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= -1) { - for (uint time = 0; time < BLOCK_SIZE; time ++) { - float4 sample_0 = samplesY[0][time][statYoffset]; - float4 sample_A = samplesX[0][time][statXoffset]; - float4 sample_B = samplesX[1][time][statXoffset]; - float4 sample_1 = samplesY[1][time][statYoffset]; + if (doLoad) + samples[loadStat % 2][loadTime][loadStat / 2] = sample; + + barrier(CLK_LOCAL_MEM_FENCE); + +#pragma unroll 1 + for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) { + float4 sample_0, sample_1, sample_A, sample_B; + + if (doCorrelateLeft) { + sample_0 = samples[0][time][statYoffset / 2]; + sample_A = samples[0][time][statXoffset / 2]; + sample_B = samples[1][time][statXoffset / 2]; + sample_1 = samples[1][time][statYoffset / 2]; vis_0A_r += sample_0.xxzz * sample_A.xzxz; vis_0A_i += sample_0.yyww * sample_A.xzxz; @@ -127,25 +435,152 @@ void correlateRectangles(__global void *visibilitiesPtr, vis_1B_i -= sample_1.xxzz * sample_B.ywyw; } } + } + + int statY = firstStation + statYoffset; + uint statX = firstStation + statXoffset; + uint baseline = (statX * (statX + 1) / 2) + statY; + + if (statXoffset < nrStationsThisBlock) { + (*visibilities)[baseline ][channel].even = vis_0A_r; + (*visibilities)[baseline ][channel].odd = vis_0A_i; + } + + if (statXoffset < nrStationsThisBlock && statYoffset + 1 < nrStationsThisBlock) { + (*visibilities)[baseline + 1][channel].even = vis_1A_r; + (*visibilities)[baseline + 1][channel].odd = vis_1A_i; + } + + if (statXoffset + 1 < nrStationsThisBlock) { + (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r; + (*visibilities)[baseline + statX + 1][channel].odd = vis_0B_i; + (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r; + (*visibilities)[baseline + statX + 2][channel].odd = vis_1B_i; + } +} + + +void correlateRectangle(VisibilitiesType visibilities, + CorrectedDataType correctedData, + __local float4 samplesX[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1], + __local float4 samplesY[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1], + uint blockX, + uint blockY +) +{ + uint channel = get_global_id(2) + 1; + +#if NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 + uint firstStationX = blockX * NR_STATIONS_PER_BLOCK; + uint firstStationY = blockY * NR_STATIONS_PER_BLOCK; +#else + uint firstStationX = (blockX - 1) * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK; + int firstStationY = (blockY - 1) * NR_STATIONS_PER_BLOCK + NR_STATIONS % NR_STATIONS_PER_BLOCK; +#endif + + uint statXoffset = get_local_id(0) / (NR_STATIONS_PER_BLOCK / 2); + uint statYoffset = get_local_id(0) % (NR_STATIONS_PER_BLOCK / 2); + + float4 vis_0A_r = 0, vis_0A_i = 0; + float4 vis_0B_r = 0, vis_0B_i = 0; + float4 vis_1A_r = 0, vis_1A_i = 0; + float4 vis_1B_r = 0, vis_1B_i = 0; + + uint loadTime = get_local_id(0) % NR_TIMES_PER_BLOCK; + uint loadStat = get_local_id(0) / NR_TIMES_PER_BLOCK; + + bool doCorrelateLower = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= 0; + bool doCorrelateUpper = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + 2 * statYoffset) >= -1; + bool doLoadY = NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || (int) (firstStationY + loadStat) >= 0; + + for (uint major = 0; major < NR_SAMPLES_PER_CHANNEL; major += NR_TIMES_PER_BLOCK) { + // load data into local memory + float4 sampleX = (*correctedData)[firstStationX + loadStat][channel][major + loadTime]; + float4 sampleY; + + if (doLoadY) + sampleY = (*correctedData)[firstStationY + loadStat][channel][major + loadTime]; barrier(CLK_LOCAL_MEM_FENCE); + + samplesX[loadStat % 2][loadTime][loadStat / 2] = sampleX; + + if (doLoadY) + samplesY[loadStat % 2][loadTime][loadStat / 2] = sampleY; + + barrier(CLK_LOCAL_MEM_FENCE); + +#pragma unroll 1 + for (uint time = 0; time < NR_TIMES_PER_BLOCK; time ++) { + float4 sample_0, sample_1, sample_A, sample_B; + + if (doCorrelateLower) { + sample_0 = samplesY[0][time][statYoffset]; + } + + if (doCorrelateUpper) { + sample_A = samplesX[0][time][statXoffset]; + sample_B = samplesX[1][time][statXoffset]; + sample_1 = samplesY[1][time][statYoffset]; + } + + if (doCorrelateLower) { + vis_0A_r += sample_0.xxzz * sample_A.xzxz; + vis_0A_i += sample_0.yyww * sample_A.xzxz; + vis_0B_r += sample_0.xxzz * sample_B.xzxz; + vis_0B_i += sample_0.yyww * sample_B.xzxz; + vis_0A_r += sample_0.yyww * sample_A.ywyw; + vis_0A_i -= sample_0.xxzz * sample_A.ywyw; + vis_0B_r += sample_0.yyww * sample_B.ywyw; + vis_0B_i -= sample_0.xxzz * sample_B.ywyw; + } + + if (doCorrelateUpper) { + vis_1A_r += sample_1.xxzz * sample_A.xzxz; + vis_1A_i += sample_1.yyww * sample_A.xzxz; + vis_1B_r += sample_1.xxzz * sample_B.xzxz; + vis_1B_i += sample_1.yyww * sample_B.xzxz; + vis_1A_r += sample_1.yyww * sample_A.ywyw; + vis_1A_i -= sample_1.xxzz * sample_A.ywyw; + vis_1B_r += sample_1.yyww * sample_B.ywyw; + vis_1B_i -= sample_1.xxzz * sample_B.ywyw; + } + } } int statY = firstStationY + 2 * statYoffset; uint statX = firstStationX + 2 * statXoffset; uint baseline = (statX * (statX + 1) / 2) + statY; - if (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || statY >= 0) { - (*visibilities)[baseline ][channel].even = vis_0A_r; - (*visibilities)[baseline ][channel].odd = vis_0A_i; - (*visibilities)[baseline + 1][channel].even = vis_1A_r; - (*visibilities)[baseline + 1][channel].odd = vis_1A_i; - } - - if (NR_STATIONS % NR_STATIONS_PER_BLOCK == 0 || statY >= -1) { + if (doCorrelateLower) { + (*visibilities)[baseline ][channel].even = vis_0A_r; + (*visibilities)[baseline ][channel].odd = vis_0A_i; (*visibilities)[baseline + statX + 1][channel].even = vis_0B_r; (*visibilities)[baseline + statX + 1][channel].odd = vis_0B_i; + } + + if (doCorrelateUpper) { + (*visibilities)[baseline + 1][channel].even = vis_1A_r; + (*visibilities)[baseline + 1][channel].odd = vis_1A_i; (*visibilities)[baseline + statX + 2][channel].even = vis_1B_r; (*visibilities)[baseline + statX + 2][channel].odd = vis_1B_i; } } + + +__kernel __attribute__((reqd_work_group_size(NR_STATIONS_PER_BLOCK * NR_STATIONS_PER_BLOCK / 4, 1, 1))) +void correlate(__global void *visibilitiesPtr, + __global const void *correctedDataPtr) +{ + __local float4 samplesX[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1]; + __local float4 samplesY[2][NR_TIMES_PER_BLOCK][NR_STATIONS_PER_BLOCK / 2 | 1]; + + uint block = get_global_id(1); + uint blockX = convert_uint_rtz(sqrt(convert_float(8 * block + 1)) - 0.99999f) / 2; + uint blockY = block - blockX * (blockX + 1) / 2; + + if (blockX == blockY) + correlateTriangle2((VisibilitiesType) visibilitiesPtr, (CorrectedDataType) correctedDataPtr, samplesX, blockX); + else + correlateRectangle((VisibilitiesType) visibilitiesPtr, (CorrectedDataType) correctedDataPtr, samplesX, samplesY, blockX, blockY); +} diff --git a/RTCP/GPUProc/src/NewCorrelator.cl-0.ptx b/RTCP/GPUProc/src/NewCorrelator.cl-0.ptx deleted file mode 100644 index c69f71f47ba5748d5cb342ecbed29c627f766f0f..0000000000000000000000000000000000000000 Binary files a/RTCP/GPUProc/src/NewCorrelator.cl-0.ptx and /dev/null differ diff --git a/RTCP/GPUProc/src/RTCP.cc b/RTCP/GPUProc/src/RTCP.cc index 0e6c8b99b6746c48388c67fd96eb40a36d5e0329..9c6a8830cdc4fcc45bba57c306ff434c9b3f56e9 100644 --- a/RTCP/GPUProc/src/RTCP.cc +++ b/RTCP/GPUProc/src/RTCP.cc @@ -14,11 +14,11 @@ #include <iostream> #include <sstream> #include <boost/multi_array.hpp> +#include <boost/date_time/posix_time/posix_time.hpp> #include "Align.h" -#include "BandPass.h" #include "ApplCommon/PosixTime.h" -#include <boost/date_time/posix_time/posix_time.hpp> +#include "BandPass.h" #include "Common/LofarLogger.h" #include "Common/SystemUtil.h" #include "FilterBank.h" @@ -30,6 +30,11 @@ #include "UHEP/InvertedStationPPFWeights.h" //#include "clAmdFft/include/clAmdFft.h" +#if defined __linux__ +#include <sched.h> +#include <sys/time.h> +#endif + namespace LOFAR { namespace RTCP { @@ -41,9 +46,7 @@ unsigned nrGPUs; #define NR_TAPS 16 #define NR_STATION_FILTER_TAPS 16 -// the SAP to process (we support only one SAP for now) -#define SAP 0 - +#undef USE_NEW_CORRELATOR #define USE_2X2 #undef USE_CUSTOM_FFT #undef USE_TEST_DATA @@ -81,6 +84,40 @@ double getTime() return now - firstTime; } +#if defined __linux__ + +inline void set_affinity(unsigned device) +{ +#if 0 + static const char mapping[1][12] = { + 0, 1, 2, 3, 8, 9, 10, 11, + }; +#else + static const char mapping[8][12] = { + { 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, }, + { 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, }, + { 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, }, + { 0, 1, 2, 3, 4, 5, 12, 13, 14, 15, 16, 17, }, + { 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, }, + { 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, }, + { 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, }, + { 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, }, + }; +#endif + + cpu_set_t set; + + CPU_ZERO(&set); + + for (unsigned coreIndex = 0; coreIndex < 12; coreIndex ++) + CPU_SET(mapping[device][coreIndex], &set); + + if (sched_setaffinity(0, sizeof set, &set) < 0) + perror("sched_setaffinity"); +} + +#endif + class PerformanceCounter { @@ -200,7 +237,7 @@ cl::Program createProgram(const Parset &ps, cl::Context &context, std::vector<cl args << " -DNR_SAMPLES_PER_CHANNEL=" << ps.nrSamplesPerChannel(); args << " -DNR_SAMPLES_PER_SUBBAND=" << ps.nrSamplesPerSubband(); args << " -DNR_BEAMS=" << ps.nrBeams(); - args << " -DNR_TABS=" << ps.nrTABs(SAP); + args << " -DNR_TABS=" << ps.nrTABs(0); args << " -DNR_COHERENT_STOKES=" << ps.nrCoherentStokes(); args << " -DNR_INCOHERENT_STOKES=" << ps.nrIncoherentStokes(); args << " -DCOHERENT_STOKES_TIME_INTEGRATION_FACTOR=" << ps.coherentStokesTimeIntegrationFactor(); @@ -257,6 +294,10 @@ class Pipeline SmartPtr<InputSection<i16complex> > inputSection16; SmartPtr<InputSection<i8complex> > inputSection8; SmartPtr<InputSection<i4complex> > inputSection4; + +#if defined USE_B7015 + OMP_Lock hostToDeviceLock[4], deviceToHostLock[4]; +#endif }; @@ -273,12 +314,12 @@ class CorrelatorPipeline : public Pipeline FilterBank filterBank; cl::Program firFilterProgram, delayAndBandPassProgram, correlatorProgram; +#if defined USE_NEW_CORRELATOR + PerformanceCounter firFilterCounter, delayAndBandPassCounter, correlateTriangleCounter, correlateRectangleCounter, fftCounter; +#else PerformanceCounter firFilterCounter, delayAndBandPassCounter, correlatorCounter, fftCounter; - PerformanceCounter samplesCounter, visibilitiesCounter; - -#if defined USE_B7015 - OMP_Lock hostToDeviceLock[4], deviceToHostLock[4]; #endif + PerformanceCounter samplesCounter, visibilitiesCounter; }; @@ -332,7 +373,12 @@ CorrelatorPipeline::CorrelatorPipeline(const Parset &ps) filterBank(true, NR_TAPS, ps.nrChannelsPerSubband(), KAISER), firFilterCounter("FIR filter"), delayAndBandPassCounter("delay/bp"), +#if defined USE_NEW_CORRELATOR + correlateTriangleCounter("cor.triangle"), + correlateRectangleCounter("cor.rectangle"), +#else correlatorCounter("correlator"), +#endif fftCounter("FFT"), samplesCounter("samples"), visibilitiesCounter("visibilities") @@ -348,8 +394,11 @@ CorrelatorPipeline::CorrelatorPipeline(const Parset &ps) #pragma omp section delayAndBandPassProgram = createProgram("DelayAndBandPass.cl"); #pragma omp section +#if defined USE_NEW_CORRELATOR correlatorProgram = createProgram("NewCorrelator.cl"); - //correlatorProgram = createProgram("Correlator.cl"); +#else + correlatorProgram = createProgram("Correlator.cl"); +#endif } std::cout << "compile time = " << getTime() - startTime << std::endl; @@ -666,7 +715,7 @@ class DelayAndBandPassKernel : public Kernel }; -#if 0 +#if !defined USE_NEW_CORRELATOR class CorrelatorKernel : public Kernel { @@ -714,12 +763,13 @@ class CorrelatorKernel : public Kernel nrThreads = (nrThreads + preferredMultiple - 1) / preferredMultiple * preferredMultiple; //std::cout << "nrBlocks = " << nrBlocks << ", nrPasses = " << nrPasses << ", preferredMultiple = " << preferredMultiple << ", nrThreads = " << nrThreads << std::endl; - globalWorkSize = cl::NDRange(nrPasses * nrThreads, ps.nrChannelsPerSubband()); + unsigned nrUsableChannels = std::max(ps.nrChannelsPerSubband() - 1, 1U); + globalWorkSize = cl::NDRange(nrPasses * nrThreads, nrUsableChannels); localWorkSize = cl::NDRange(nrThreads, 1); - nrOperations = (size_t) ps.nrChannelsPerSubband() * ps.nrBaselines() * ps.nrSamplesPerChannel() * 32; - nrBytesRead = (size_t) nrPasses * ps.nrStations() * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>); - nrBytesWritten = (size_t) ps.nrBaselines() * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>); + nrOperations = (size_t) nrUsableChannels * ps.nrBaselines() * ps.nrSamplesPerChannel() * 32; + nrBytesRead = (size_t) nrPasses * ps.nrStations() * nrUsableChannels * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>); + nrBytesWritten = (size_t) ps.nrBaselines() * nrUsableChannels * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>); } }; @@ -731,7 +781,7 @@ class CorrelatorKernel : public Kernel CorrelatorKernel(const Parset &ps, cl::CommandQueue &queue, cl::Program &program, cl::Buffer &devVisibilities, cl::Buffer &devCorrectedData) : #if defined USE_2X2 - Kernel(ps, program, "correlateRectangles") + Kernel(ps, program, "correlate") #else #error not implemented #endif @@ -739,17 +789,90 @@ class CorrelatorKernel : public Kernel setArg(0, devVisibilities); setArg(1, devCorrectedData); - unsigned nrRectanglesPerSide = ((ps.nrStations() - 1) / (2 * 16)); + unsigned nrRectanglesPerSide = (ps.nrStations() - 1) / (2 * 16); + unsigned nrRectangles = nrRectanglesPerSide * (nrRectanglesPerSide + 1) / 2; +//#pragma omp critical (cout) + //std::cout << "nrRectangles = " << nrRectangles << std::endl; + + unsigned nrBlocksPerSide = (ps.nrStations() + 2 * 16 - 1) / (2 * 16); + unsigned nrBlocks = nrBlocksPerSide * (nrBlocksPerSide + 1) / 2; +//#pragma omp critical (cout) + //std::cout << "nrBlocks = " << nrBlocks << std::endl; + + unsigned nrUsableChannels = std::max(ps.nrChannelsPerSubband() - 1, 1U); + globalWorkSize = cl::NDRange(16 * 16, nrBlocks, nrUsableChannels); + localWorkSize = cl::NDRange(16 * 16, 1, 1); + + // FIXME + //nrOperations = (size_t) (32 * 32) * nrRectangles * nrUsableChannels * ps.nrSamplesPerChannel() * 32; + nrOperations = (size_t) ps.nrBaselines() * ps.nrSamplesPerSubband() * 32; + nrBytesRead = (size_t) (32 + 32) * nrRectangles * nrUsableChannels * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>); + nrBytesWritten = (size_t) (32 * 32) * nrRectangles * nrUsableChannels * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>); + } +}; + + +class CorrelateRectangleKernel : public Kernel +{ + public: + CorrelateRectangleKernel(const Parset &ps, cl::CommandQueue &queue, cl::Program &program, cl::Buffer &devVisibilities, cl::Buffer &devCorrectedData) + : +#if defined USE_2X2 + Kernel(ps, program, "correlateRectangleKernel") +#else +#error not implemented +#endif + { + setArg(0, devVisibilities); + setArg(1, devCorrectedData); + + unsigned nrRectanglesPerSide = (ps.nrStations() - 1) / (2 * 16); unsigned nrRectangles = nrRectanglesPerSide * (nrRectanglesPerSide + 1) / 2; #pragma omp critical (cout) std::cout << "nrRectangles = " << nrRectangles << std::endl; - globalWorkSize = cl::NDRange(16 * 16, nrRectangles, ps.nrChannelsPerSubband()); + unsigned nrUsableChannels = std::max(ps.nrChannelsPerSubband() - 1, 1U); + globalWorkSize = cl::NDRange(16 * 16, nrRectangles, nrUsableChannels); localWorkSize = cl::NDRange(16 * 16, 1, 1); - nrOperations = (size_t) (32 * 32) * nrRectangles * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * 32; - nrBytesRead = (size_t) (32 + 32) * nrRectangles * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>); - nrBytesWritten = (size_t) (32 * 32) * nrRectangles * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>); + nrOperations = (size_t) (32 * 32) * nrRectangles * nrUsableChannels * ps.nrSamplesPerChannel() * 32; + nrBytesRead = (size_t) (32 + 32) * nrRectangles * nrUsableChannels * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>); + nrBytesWritten = (size_t) (32 * 32) * nrRectangles * nrUsableChannels * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>); + } +}; + + +class CorrelateTriangleKernel : public Kernel +{ + public: + CorrelateTriangleKernel(const Parset &ps, cl::CommandQueue &queue, cl::Program &program, cl::Buffer &devVisibilities, cl::Buffer &devCorrectedData) + : +#if defined USE_2X2 + Kernel(ps, program, "correlateTriangleKernel") +#else +#error not implemented +#endif + { + setArg(0, devVisibilities); + setArg(1, devCorrectedData); + + unsigned nrTriangles = (ps.nrStations() + 2 * 16 - 1) / (2 * 16); + unsigned nrMiniBlocksPerSide = 16; + unsigned nrMiniBlocks = nrMiniBlocksPerSide * (nrMiniBlocksPerSide + 1) / 2; + size_t preferredMultiple; + getWorkGroupInfo(queue.getInfo<CL_QUEUE_DEVICE>(), CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &preferredMultiple); + unsigned nrThreads = align(nrMiniBlocks, preferredMultiple); + +#pragma omp critical (cout) + std::cout << "nrTriangles = " << nrTriangles << ", nrMiniBlocks = " << nrMiniBlocks << ", nrThreads = " << nrThreads << std::endl; + + unsigned nrUsableChannels = std::max(ps.nrChannelsPerSubband() - 1, 1U); + globalWorkSize = cl::NDRange(nrThreads, nrTriangles, nrUsableChannels); + localWorkSize = cl::NDRange(nrThreads, 1, 1); + + nrOperations = (size_t) (32 * 32 / 2) * nrTriangles * nrUsableChannels * ps.nrSamplesPerChannel() * 32; + nrBytesRead = (size_t) 32 * nrTriangles * nrUsableChannels * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>); + nrBytesWritten = (size_t) (32 * 32 / 2) * nrTriangles * nrUsableChannels * NR_POLARIZATIONS * NR_POLARIZATIONS * sizeof(std::complex<float>); } }; @@ -815,18 +938,18 @@ class BeamFormerKernel : public Kernel setArg(1, devCorrectedData); setArg(2, devBeamFormerWeights); - globalWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(SAP), ps.nrChannelsPerSubband()); - localWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(SAP), 1); + globalWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(0), ps.nrChannelsPerSubband()); + localWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(0), 1); // FIXME: nrTABs - //queue.enqueueNDRangeKernel(*this, cl::NullRange, cl::NDRange(16, ps.nrTABs(SAP), ps.nrChannelsPerSubband()), cl::NDRange(16, ps.nrTABs(SAP), 1), 0, &event); + //queue.enqueueNDRangeKernel(*this, cl::NullRange, cl::NDRange(16, ps.nrTABs(0), ps.nrChannelsPerSubband()), cl::NDRange(16, ps.nrTABs(0), 1), 0, &event); size_t count = ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * NR_POLARIZATIONS; - size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(SAP) * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * sizeof(std::complex<float>); + size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(0) * ps.nrChannelsPerSubband() * NR_POLARIZATIONS * sizeof(std::complex<float>); size_t nrSampleBytesPerPass = count * ps.nrStations() * sizeof(std::complex<float>); - size_t nrComplexVoltagesBytesPerPass = count * ps.nrTABs(SAP) * sizeof(std::complex<float>); + size_t nrComplexVoltagesBytesPerPass = count * ps.nrTABs(0) * sizeof(std::complex<float>); unsigned nrPasses = std::max((ps.nrStations() + 6) / 16, 1U); - nrOperations = count * ps.nrStations() * ps.nrTABs(SAP) * 8; + nrOperations = count * ps.nrStations() * ps.nrTABs(0) * 8; nrBytesRead = nrWeightsBytes + nrSampleBytesPerPass + (nrPasses - 1) * nrComplexVoltagesBytesPerPass; nrBytesWritten = nrPasses * nrComplexVoltagesBytesPerPass; } @@ -844,14 +967,14 @@ class BeamFormerTransposeKernel : public Kernel setArg(0, devTransposedData); setArg(1, devComplexVoltages); - //globalWorkSize = cl::NDRange(256, (ps.nrTABs(SAP) + 15) / 16, (ps.nrChannelsPerSubband() + 15) / 16); - globalWorkSize = cl::NDRange(256, (ps.nrTABs(SAP) + 15) / 16, ps.nrSamplesPerChannel() / 16); + //globalWorkSize = cl::NDRange(256, (ps.nrTABs(0) + 15) / 16, (ps.nrChannelsPerSubband() + 15) / 16); + globalWorkSize = cl::NDRange(256, (ps.nrTABs(0) + 15) / 16, ps.nrSamplesPerChannel() / 16); localWorkSize = cl::NDRange(256, 1, 1); nrOperations = 0; - nrBytesRead = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>), - //nrBytesWritten = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * ps.nrChannelsPerSubband() * sizeof(std::complex<float>); - nrBytesWritten = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * sizeof(std::complex<float>); + nrBytesRead = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>), + //nrBytesWritten = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * ps.nrChannelsPerSubband() * sizeof(std::complex<float>); + nrBytesWritten = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * sizeof(std::complex<float>); } }; @@ -871,7 +994,7 @@ class Dedispersion_FFT_Kernel void enqueue(cl::CommandQueue &queue, PerformanceCounter &counter, clFFT_Direction direction) { - size_t nrFFTs = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(); + size_t nrFFTs = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(); cl_int error = clFFT_ExecuteInterleaved(queue(), plan.plan, nrFFTs, direction, buffer(), buffer(), 0, 0, &event()); @@ -896,7 +1019,7 @@ class DedispersionForwardFFTkernel : public FFT_Kernel public: DedispersionForwardFFTkernel(const Parset &ps, cl::Context &context, cl::Buffer &buffer) : - FFT_Kernel(context, ps.dedispersionFFTsize(), ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(), true, buffer) + FFT_Kernel(context, ps.dedispersionFFTsize(), ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(), true, buffer) { ASSERT(ps.nrSamplesPerChannel() % ps.dedispersionFFTsize() == 0); } @@ -908,7 +1031,7 @@ class DedispersionBackwardFFTkernel : public FFT_Kernel public: DedispersionBackwardFFTkernel(const Parset &ps, cl::Context &context, cl::Buffer &buffer) : - FFT_Kernel(context, ps.dedispersionFFTsize(), ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(), false, buffer) + FFT_Kernel(context, ps.dedispersionFFTsize(), ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() / ps.dedispersionFFTsize(), false, buffer) { ASSERT(ps.nrSamplesPerChannel() % ps.dedispersionFFTsize() == 0); } @@ -946,8 +1069,8 @@ class DedispersionChirpKernel : public Kernel //std::cout << "localWorkSize = NDRange(" << fftSize / divisor << ", 1, 1))" << std::endl; } - nrOperations = (size_t) NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * (9 * ps.nrTABs(SAP) + 17), - nrBytesRead = nrBytesWritten = sizeof(std::complex<float>) * ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel(); + nrOperations = (size_t) NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * (9 * ps.nrTABs(0) + 17), + nrBytesRead = nrBytesWritten = sizeof(std::complex<float>) * ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel(); } void enqueue(cl::CommandQueue &queue, PerformanceCounter &counter, double subbandFrequency) @@ -970,12 +1093,12 @@ class CoherentStokesKernel : public Kernel setArg(0, devStokesData); setArg(1, devComplexVoltages); - globalWorkSize = cl::NDRange(256, (ps.nrTABs(SAP) + 15) / 16, (ps.nrChannelsPerSubband() + 15) / 16); + globalWorkSize = cl::NDRange(256, (ps.nrTABs(0) + 15) / 16, (ps.nrChannelsPerSubband() + 15) / 16); localWorkSize = cl::NDRange(256, 1, 1); - nrOperations = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(SAP) * (ps.nrCoherentStokes() == 1 ? 8 : 20 + 2.0 / ps.coherentStokesTimeIntegrationFactor()); - nrBytesRead = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>); - nrBytesWritten = (size_t) ps.nrTABs(SAP) * ps.nrCoherentStokes() * ps.nrSamplesPerChannel() / ps.coherentStokesTimeIntegrationFactor() * ps.nrChannelsPerSubband() * sizeof(float); + nrOperations = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(0) * (ps.nrCoherentStokes() == 1 ? 8 : 20 + 2.0 / ps.coherentStokesTimeIntegrationFactor()); + nrBytesRead = (size_t) ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>); + nrBytesWritten = (size_t) ps.nrTABs(0) * ps.nrCoherentStokes() * ps.nrSamplesPerChannel() / ps.coherentStokesTimeIntegrationFactor() * ps.nrChannelsPerSubband() * sizeof(float); } }; @@ -992,31 +1115,31 @@ class UHEP_BeamFormerKernel : public Kernel setArg(2, devBeamFormerWeights); #if 1 - globalWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(SAP), ps.nrSubbands()); - localWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(SAP), 1); + globalWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(0), ps.nrSubbands()); + localWorkSize = cl::NDRange(NR_POLARIZATIONS, ps.nrTABs(0), 1); size_t count = ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * NR_POLARIZATIONS; - size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(SAP) * ps.nrSubbands() * NR_POLARIZATIONS * sizeof(std::complex<float>); + size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(0) * ps.nrSubbands() * NR_POLARIZATIONS * sizeof(std::complex<float>); size_t nrSampleBytes = count * ps.nrStations() * ps.nrBytesPerComplexSample(); - size_t nrComplexVoltagesBytesPerPass = count * ps.nrTABs(SAP) * sizeof(std::complex<float>); + size_t nrComplexVoltagesBytesPerPass = count * ps.nrTABs(0) * sizeof(std::complex<float>); unsigned nrPasses = std::max((ps.nrStations() + 6) / 16, 1U); - nrOperations = count * ps.nrStations() * ps.nrTABs(SAP) * 8; + nrOperations = count * ps.nrStations() * ps.nrTABs(0) * 8; nrBytesRead = nrWeightsBytes + nrSampleBytes + (nrPasses - 1) * nrComplexVoltagesBytesPerPass; nrBytesWritten = nrPasses * nrComplexVoltagesBytesPerPass; #else - ASSERT(ps.nrTABs(SAP) % 3 == 0); + ASSERT(ps.nrTABs(0) % 3 == 0); ASSERT(ps.nrStations() % 6 == 0); - unsigned nrThreads = NR_POLARIZATIONS * (ps.nrTABs(SAP) / 3) * (ps.nrStations() / 6); + unsigned nrThreads = NR_POLARIZATIONS * (ps.nrTABs(0) / 3) * (ps.nrStations() / 6); globalWorkSize = cl::NDRange(nrThreads, ps.nrSubbands()); localWorkSize = cl::NDRange(nrThreads, 1); - //globalWorkSize = cl::NDRange(ps.nrStations() / 6, ps.nrTABs(SAP) / 3, ps.nrSubbands()); - //localWorkSize = cl::NDRange(ps.nrStations() / 6, ps.nrTABs(SAP) / 3, 1); + //globalWorkSize = cl::NDRange(ps.nrStations() / 6, ps.nrTABs(0) / 3, ps.nrSubbands()); + //localWorkSize = cl::NDRange(ps.nrStations() / 6, ps.nrTABs(0) / 3, 1); size_t count = ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * NR_POLARIZATIONS; - size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(SAP) * ps.nrSubbands() * NR_POLARIZATIONS * sizeof(std::complex<float>); + size_t nrWeightsBytes = ps.nrStations() * ps.nrTABs(0) * ps.nrSubbands() * NR_POLARIZATIONS * sizeof(std::complex<float>); size_t nrSampleBytes = count * ps.nrStations() * ps.nrBytesPerComplexSample(); - size_t nrComplexVoltagesBytes = count * ps.nrTABs(SAP) * sizeof(std::complex<float>); - nrOperations = count * ps.nrStations() * ps.nrTABs(SAP) * 8; + size_t nrComplexVoltagesBytes = count * ps.nrTABs(0) * sizeof(std::complex<float>); + nrOperations = count * ps.nrStations() * ps.nrTABs(0) * 8; nrBytesRead = nrWeightsBytes + nrSampleBytes; nrBytesWritten = nrComplexVoltagesBytes; #endif @@ -1035,12 +1158,12 @@ class UHEP_TransposeKernel : public Kernel setArg(1, devComplexVoltages); setArg(2, devReverseSubbandMapping); - globalWorkSize = cl::NDRange(256, (ps.nrTABs(SAP) + 15) / 16, 512 / 16); + globalWorkSize = cl::NDRange(256, (ps.nrTABs(0) + 15) / 16, 512 / 16); localWorkSize = cl::NDRange(256, 1, 1); nrOperations = 0; - nrBytesRead = (size_t) ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>); - nrBytesWritten = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * 512 * sizeof(std::complex<float>); + nrBytesRead = (size_t) ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>); + nrBytesWritten = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * 512 * sizeof(std::complex<float>); } }; @@ -1055,10 +1178,10 @@ class UHEP_InvFFT_Kernel : public Kernel setArg(0, devFFTedData); setArg(1, devFFTedData); - globalWorkSize = cl::NDRange(128, ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrSamplesPerChannel()); + globalWorkSize = cl::NDRange(128, ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrSamplesPerChannel()); localWorkSize = cl::NDRange(128, 1); - size_t nrFFTs = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1); + size_t nrFFTs = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1); nrOperations = nrFFTs * 5 * 1024 * 10; nrBytesRead = nrFFTs * 512 * sizeof(std::complex<float>); nrBytesWritten = nrFFTs * 1024 * sizeof(float); @@ -1083,10 +1206,10 @@ class UHEP_InvFIR_Kernel : public Kernel for (nrThreads = 1024; nrThreads > maxNrThreads; nrThreads /= 2) ; - globalWorkSize = cl::NDRange(1024, NR_POLARIZATIONS, ps.nrTABs(SAP)); + globalWorkSize = cl::NDRange(1024, NR_POLARIZATIONS, ps.nrTABs(0)); localWorkSize = cl::NDRange(nrThreads, 1, 1); - size_t count = ps.nrTABs(SAP) * NR_POLARIZATIONS * 1024; + size_t count = ps.nrTABs(0) * NR_POLARIZATIONS * 1024; nrOperations = count * ps.nrSamplesPerChannel() * NR_STATION_FILTER_TAPS * 2; nrBytesRead = count * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * sizeof(float); nrBytesWritten = count * ps.nrSamplesPerChannel() * sizeof(float); @@ -1104,12 +1227,12 @@ class UHEP_TriggerKernel : public Kernel setArg(0, devTriggerInfo); setArg(1, devInvFIRfilteredData); - globalWorkSize = cl::NDRange(16, 16, ps.nrTABs(SAP)); + globalWorkSize = cl::NDRange(16, 16, ps.nrTABs(0)); localWorkSize = cl::NDRange(16, 16, 1); - nrOperations = (size_t) ps.nrTABs(SAP) * ps.nrSamplesPerChannel() * 1024 * (3 /* power */ + 2 /* window */ + 1 /* max */ + 7 /* mean/variance */); - nrBytesRead = (size_t) ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * 1024 * sizeof(float); - nrBytesWritten = (size_t) ps.nrTABs(SAP) * sizeof(TriggerInfo); + nrOperations = (size_t) ps.nrTABs(0) * ps.nrSamplesPerChannel() * 1024 * (3 /* power */ + 2 /* window */ + 1 /* max */ + 7 /* mean/variance */); + nrBytesRead = (size_t) ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * 1024 * sizeof(float); + nrBytesWritten = (size_t) ps.nrTABs(0) * sizeof(TriggerInfo); } }; @@ -1143,6 +1266,8 @@ CorrelatorWorkQueue::CorrelatorWorkQueue(CorrelatorPipeline &pipeline) inputSamples(boost::extents[ps.nrStations()][(ps.nrSamplesPerChannel() + NR_TAPS - 1) * ps.nrChannelsPerSubband()][NR_POLARIZATIONS][ps.nrBytesPerComplexSample()], queue, CL_MEM_WRITE_ONLY, devBufferA), visibilities(boost::extents[ps.nrBaselines()][ps.nrChannelsPerSubband()][NR_POLARIZATIONS][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, devBufferB) { + memset(inputSamples.origin(), 0, inputSamples.bytesize()); // FIXME + memset(visibilities.origin(), 0, visibilities.bytesize()); // FIXME size_t firWeightsSize = ps.nrChannelsPerSubband() * NR_TAPS * sizeof(float); devFIRweights = cl::Buffer(pipeline.context, CL_MEM_READ_ONLY, firWeightsSize); queue.enqueueWriteBuffer(devFIRweights, CL_TRUE, 0, ps.nrChannelsPerSubband() * NR_TAPS * sizeof(float), pipeline.filterBank.getWeights().origin()); @@ -1168,7 +1293,12 @@ void CorrelatorWorkQueue::doWork() FIR_FilterKernel firFilterKernel(ps, queue, pipeline.firFilterProgram, devFilteredData, inputSamples, devFIRweights); Filter_FFT_Kernel fftKernel(ps, pipeline.context, devFilteredData); DelayAndBandPassKernel delayAndBandPassKernel(ps, pipeline.delayAndBandPassProgram, devCorrectedData, devFilteredData, delaysAtBegin, delaysAfterEnd, phaseOffsets, bandPassCorrectionWeights); +#if defined USE_NEW_CORRELATOR + CorrelateTriangleKernel correlateTriangleKernel(ps, queue, pipeline.correlatorProgram, visibilities, devCorrectedData); + CorrelateRectangleKernel correlateRectangleKernel(ps, queue, pipeline.correlatorProgram, visibilities, devCorrectedData); +#else CorrelatorKernel correlatorKernel(ps, queue, pipeline.correlatorProgram, visibilities, devCorrectedData); +#endif double startTime = ps.startTime(), currentTime, stopTime = ps.stopTime(), blockTime = ps.CNintegrationTime(); #pragma omp barrier @@ -1176,7 +1306,7 @@ void CorrelatorWorkQueue::doWork() double executionStartTime = getTime(); for (unsigned block = 0; (currentTime = startTime + block * blockTime) < stopTime; block ++) { -#pragma omp single +#pragma omp single nowait #pragma omp critical (cout) std::cout << "block = " << block << ", time = " << to_simple_string(from_ustime_t(currentTime)) << std::endl; @@ -1185,17 +1315,14 @@ void CorrelatorWorkQueue::doWork() memset(phaseOffsets.origin(), 0, phaseOffsets.bytesize()); // FIXME!!! - if (ps.nrStations() >= 3) - delaysAtBegin[0][2][0] = 1e-6, delaysAfterEnd[0][2][0] = 1.1e-6; + //if (ps.nrStations() >= 3) + //delaysAtBegin[0][2][0] = 1e-6, delaysAfterEnd[0][2][0] = 1.1e-6; delaysAtBegin.hostToDevice(CL_FALSE); delaysAfterEnd.hostToDevice(CL_FALSE); phaseOffsets.hostToDevice(CL_FALSE); - queue.finish(); -#pragma omp barrier - -#pragma omp for schedule(dynamic) +#pragma omp for schedule(dynamic), nowait for (unsigned subband = 0; subband < ps.nrSubbands(); subband ++) { try { #if defined USE_TEST_DATA @@ -1217,7 +1344,12 @@ void CorrelatorWorkQueue::doWork() } delayAndBandPassKernel.enqueue(queue, pipeline.delayAndBandPassCounter, subband); +#if defined USE_NEW_CORRELATOR + correlateTriangleKernel.enqueue(queue, pipeline.correlateTriangleCounter); + correlateRectangleKernel.enqueue(queue, pipeline.correlateRectangleCounter); +#else correlatorKernel.enqueue(queue, pipeline.correlatorCounter); +#endif queue.finish(); { @@ -1276,11 +1408,11 @@ BeamFormerWorkQueue::BeamFormerWorkQueue(BeamFormerPipeline &pipeline) delaysAfterEnd(boost::extents[ps.nrBeams()][ps.nrStations()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY), phaseOffsets(boost::extents[ps.nrBeams()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY), devCorrectedData(cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, ps.nrStations() * ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * NR_POLARIZATIONS * sizeof(std::complex<float>))), - beamFormerWeights(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrTABs(SAP)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY), - devComplexVoltages(cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>))), - //transposedComplexVoltages(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE) - transposedComplexVoltages(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE), - DMs(boost::extents[ps.nrTABs(SAP)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY) + beamFormerWeights(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrTABs(0)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY), + devComplexVoltages(cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, ps.nrChannelsPerSubband() * ps.nrSamplesPerChannel() * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>))), + //transposedComplexVoltages(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE) + transposedComplexVoltages(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE), + DMs(boost::extents[ps.nrTABs(0)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY) { if (ps.correctBandPass()) { BandPass::computeCorrectionFactors(bandPassCorrectionWeights.origin(), ps.nrChannelsPerSubband()); @@ -1310,7 +1442,7 @@ void BeamFormerWorkQueue::doWork() double executionStartTime = getTime(); for (unsigned block = 0; (currentTime = startTime + block * blockTime) < stopTime; block ++) { -#pragma omp single +#pragma omp single nowait #pragma omp critical (cout) std::cout << "block = " << block << ", time = " << to_simple_string(from_ustime_t(currentTime)) << std::endl; @@ -1326,11 +1458,8 @@ void BeamFormerWorkQueue::doWork() delaysAfterEnd.hostToDevice(CL_FALSE); phaseOffsets.hostToDevice(CL_FALSE); beamFormerWeights.hostToDevice(CL_FALSE); - queue.finish(); - -#pragma omp barrier -#pragma omp for schedule(dynamic) +#pragma omp for schedule(dynamic), nowait for (unsigned subband = 0; subband < ps.nrSubbands(); subband ++) { try { #if 1 @@ -1384,13 +1513,13 @@ UHEP_WorkQueue::UHEP_WorkQueue(UHEP_Pipeline &pipeline) WorkQueue(pipeline), pipeline(pipeline), hostInputSamples(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][NR_POLARIZATIONS][ps.nrBytesPerComplexSample()], queue, CL_MEM_WRITE_ONLY), - hostBeamFormerWeights(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrTABs(SAP)], queue, CL_MEM_WRITE_ONLY), - hostTriggerInfo(ps.nrTABs(SAP), queue, CL_MEM_READ_ONLY) + hostBeamFormerWeights(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrTABs(0)], queue, CL_MEM_WRITE_ONLY), + hostTriggerInfo(ps.nrTABs(0), queue, CL_MEM_READ_ONLY) { size_t inputSamplesSize = ps.nrStations() * ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * NR_POLARIZATIONS * ps.nrBytesPerComplexSample(); - size_t complexVoltagesSize = ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * ps.nrTABs(SAP) * NR_POLARIZATIONS * sizeof(std::complex<float>); - size_t transposedDataSize = ps.nrTABs(SAP) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * 512 * sizeof(std::complex<float>); - size_t invFIRfilteredDataSize = ps.nrTABs(SAP) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * 512 * sizeof(std::complex<float>); + size_t complexVoltagesSize = ps.nrSubbands() * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * ps.nrTABs(0) * NR_POLARIZATIONS * sizeof(std::complex<float>); + size_t transposedDataSize = ps.nrTABs(0) * NR_POLARIZATIONS * (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) * 512 * sizeof(std::complex<float>); + size_t invFIRfilteredDataSize = ps.nrTABs(0) * NR_POLARIZATIONS * ps.nrSamplesPerChannel() * 512 * sizeof(std::complex<float>); size_t buffer0size = std::max(inputSamplesSize, transposedDataSize); size_t buffer1size = std::max(complexVoltagesSize, invFIRfilteredDataSize); @@ -1398,7 +1527,7 @@ UHEP_WorkQueue::UHEP_WorkQueue(UHEP_Pipeline &pipeline) devBuffers[0] = cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, buffer0size); devBuffers[1] = cl::Buffer(pipeline.context, CL_MEM_READ_WRITE, buffer1size); - size_t beamFormerWeightsSize = ps.nrStations() * ps.nrSubbands() * ps.nrTABs(SAP) * sizeof(std::complex<float>); + size_t beamFormerWeightsSize = ps.nrStations() * ps.nrSubbands() * ps.nrTABs(0) * sizeof(std::complex<float>); devBeamFormerWeights = cl::Buffer(pipeline.context, CL_MEM_READ_ONLY, beamFormerWeightsSize); devInputSamples = devBuffers[0]; @@ -1409,7 +1538,7 @@ UHEP_WorkQueue::UHEP_WorkQueue(UHEP_Pipeline &pipeline) devFFTedData = devBuffers[0]; devInvFIRfilteredData = devBuffers[1]; - devTriggerInfo = cl::Buffer(pipeline.context, CL_MEM_WRITE_ONLY, ps.nrTABs(SAP) * sizeof(TriggerInfo)); + devTriggerInfo = cl::Buffer(pipeline.context, CL_MEM_WRITE_ONLY, ps.nrTABs(0) * sizeof(TriggerInfo)); } @@ -1430,12 +1559,12 @@ void UHEP_WorkQueue::doWork(const float * /*delaysAtBegin*/, const float * /*del double executionStartTime = getTime(); -#pragma omp for schedule(dynamic) +#pragma omp for schedule(dynamic), nowait for (unsigned block = 0; block < nrBlocks; block ++) { try { double currentTime = startTime + block * blockTime; -//#pragma omp single // FIXME: why does the compiler complain here??? +//#pragma omp single nowait // FIXME: why does the compiler complain here??? #pragma omp critical (cout) std::cout << "block = " << block << ", time = " << to_simple_string(from_ustime_t(currentTime)) << std::endl; @@ -1607,18 +1736,21 @@ struct CorrelatorTest : public UnitTest { CorrelatorTest(const Parset &ps) : - //UnitTest(ps, "Correlator.cl") +#if defined USE_NEW_CORRELATOR UnitTest(ps, "NewCorrelator.cl") +#else + UnitTest(ps, "Correlator.cl") +#endif { if (ps.nrStations() >= 5 && ps.nrChannelsPerSubband() >= 6 && ps.nrSamplesPerChannel() >= 100) { - MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); MultiArraySharedBuffer<std::complex<float>, 4> visibilities(boost::extents[ps.nrBaselines()][ps.nrChannelsPerSubband()][NR_POLARIZATIONS][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); CorrelatorKernel correlator(ps, queue, program, visibilities, inputData); //inputData[3][5][99][1] = std::complex<float>(3, 4); //inputData[4][5][99][1] = std::complex<float>(5, 6); - inputData[2][5][99][1] = std::complex<float>(3, 4); - inputData[65][5][99][1] = std::complex<float>(5, 6); + inputData[0][5][99][1] = std::complex<float>(3, 4); + inputData[2][5][99][1] = std::complex<float>(5, 6); visibilities.hostToDevice(CL_FALSE); inputData.hostToDevice(CL_FALSE); @@ -1635,6 +1767,70 @@ visibilities.hostToDevice(CL_FALSE); }; +#if defined USE_NEW_CORRELATOR + +struct CorrelateRectangleTest : public UnitTest +{ + CorrelateRectangleTest(const Parset &ps) + : + //UnitTest(ps, "Correlator.cl") + UnitTest(ps, "NewCorrelator.cl") + { + if (ps.nrStations() >= 5 && ps.nrChannelsPerSubband() >= 6 && ps.nrSamplesPerChannel() >= 100) { + MultiArraySharedBuffer<std::complex<float>, 4> visibilities(boost::extents[ps.nrBaselines()][ps.nrChannelsPerSubband()][NR_POLARIZATIONS][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); + CorrelateRectangleKernel correlator(ps, queue, program, visibilities, inputData); + + inputData[27][5][99][1] = std::complex<float>(3, 4); + inputData[68][5][99][1] = std::complex<float>(5, 6); + +visibilities.hostToDevice(CL_FALSE); + inputData.hostToDevice(CL_FALSE); + correlator.enqueue(queue, counter); + visibilities.deviceToHost(CL_TRUE); + + //check(visibilities[5463][5][1][1], std::complex<float>(39, 2)); + for (unsigned bl = 0; bl < ps.nrBaselines(); bl ++) + if (visibilities[bl][5][1][1] != std::complex<float>(0, 0)) + std::cout << "bl = " << bl << ", visibility = " << visibilities[bl][5][1][1] << std::endl; + } + } +}; + + +struct CorrelateTriangleTest : public UnitTest +{ + CorrelateTriangleTest(const Parset &ps) + : + //UnitTest(ps, "Correlator.cl") + UnitTest(ps, "NewCorrelator.cl") + { + if (ps.nrStations() >= 5 && ps.nrChannelsPerSubband() >= 6 && ps.nrSamplesPerChannel() >= 100) { + MultiArraySharedBuffer<std::complex<float>, 4> visibilities(boost::extents[ps.nrBaselines()][ps.nrChannelsPerSubband()][NR_POLARIZATIONS][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); + CorrelateTriangleKernel correlator(ps, queue, program, visibilities, inputData); + + //inputData[3][5][99][1] = std::complex<float>(3, 4); + //inputData[4][5][99][1] = std::complex<float>(5, 6); + inputData[0][5][99][1] = std::complex<float>(3, 4); + inputData[2][5][99][1] = std::complex<float>(5, 6); + +visibilities.hostToDevice(CL_FALSE); + inputData.hostToDevice(CL_FALSE); + correlator.enqueue(queue, counter); + visibilities.deviceToHost(CL_TRUE); + + //check(visibilities[13][5][1][1], std::complex<float>(39, 2)); + for (unsigned bl = 0; bl < ps.nrBaselines(); bl ++) + if (visibilities[bl][5][1][1] != std::complex<float>(0, 0)) + std::cout << "bl = " << bl << ", visibility = " << visibilities[bl][5][1][1] << std::endl; + } + } +}; + +#endif + + struct IncoherentStokesTest : public UnitTest { IncoherentStokesTest(const Parset &ps) @@ -1699,10 +1895,10 @@ struct BeamFormerTest : public UnitTest : UnitTest(ps, "BeamFormer/BeamFormer.cl") { - if (ps.nrStations() >= 5 && ps.nrSamplesPerChannel() >= 13 && ps.nrChannelsPerSubband() >= 7 && ps.nrTABs(SAP) >= 6) { + if (ps.nrStations() >= 5 && ps.nrSamplesPerChannel() >= 13 && ps.nrChannelsPerSubband() >= 7 && ps.nrTABs(0) >= 6) { MultiArraySharedBuffer<std::complex<float>, 4> inputData(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][NR_POLARIZATIONS], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); - MultiArraySharedBuffer<std::complex<float>, 3> beamFormerWeights(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrTABs(SAP)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); - MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE); + MultiArraySharedBuffer<std::complex<float>, 3> beamFormerWeights(boost::extents[ps.nrStations()][ps.nrChannelsPerSubband()][ps.nrTABs(0)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE); BeamFormerKernel beamFormer(ps, program, complexVoltages, inputData, beamFormerWeights); inputData[4][6][12][1] = std::complex<float>(2.2, 3); @@ -1716,7 +1912,7 @@ struct BeamFormerTest : public UnitTest check(complexVoltages[6][12][5][1], std::complex<float>(-6.2, 23)); #if 0 - for (unsigned tab = 0; tab < ps.nrTABs(SAP); tab ++) + for (unsigned tab = 0; tab < ps.nrTABs(0); tab ++) for (unsigned pol = 0; pol < NR_POLARIZATIONS; pol ++) for (unsigned ch = 0; ch < ps.nrChannelsPerSubband(); ch ++) for (unsigned t = 0; t < ps.nrSamplesPerChannel(); t ++) @@ -1734,9 +1930,9 @@ struct BeamFormerTransposeTest : public UnitTest : UnitTest(ps, "BeamFormer/Transpose.cl") { - if (ps.nrChannelsPerSubband() >= 19 && ps.nrSamplesPerChannel() >= 175 && ps.nrTABs(SAP) >= 5) { - MultiArraySharedBuffer<std::complex<float>, 4> transposedData(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); - MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY); + if (ps.nrChannelsPerSubband() >= 19 && ps.nrSamplesPerChannel() >= 175 && ps.nrTABs(0) >= 5) { + MultiArraySharedBuffer<std::complex<float>, 4> transposedData(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY); BeamFormerTransposeKernel transpose(ps, program, transposedData, complexVoltages); complexVoltages[18][174][4][1] = std::complex<float>(24, 42); @@ -1757,9 +1953,9 @@ struct DedispersionChirpTest : public UnitTest : UnitTest(ps, "BeamFormer/Dedispersion.cl") { - if (ps.nrTABs(SAP) > 3 && ps.nrChannelsPerSubband() > 13 && ps.nrSamplesPerChannel() / ps.dedispersionFFTsize() > 1 && ps.dedispersionFFTsize() > 77) { - MultiArraySharedBuffer<std::complex<float>, 5> data(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel() / ps.dedispersionFFTsize()][ps.dedispersionFFTsize()], queue, CL_MEM_READ_WRITE, CL_MEM_READ_WRITE); - MultiArraySharedBuffer<float, 1> DMs(boost::extents[ps.nrTABs(SAP)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); + if (ps.nrTABs(0) > 3 && ps.nrChannelsPerSubband() > 13 && ps.nrSamplesPerChannel() / ps.dedispersionFFTsize() > 1 && ps.dedispersionFFTsize() > 77) { + MultiArraySharedBuffer<std::complex<float>, 5> data(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel() / ps.dedispersionFFTsize()][ps.dedispersionFFTsize()], queue, CL_MEM_READ_WRITE, CL_MEM_READ_WRITE); + MultiArraySharedBuffer<float, 1> DMs(boost::extents[ps.nrTABs(0)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); DedispersionChirpKernel dedispersionChirpKernel(ps, program, queue, data, DMs); data[3][1][13][1][77] = std::complex<float>(2, 3); @@ -1782,16 +1978,16 @@ struct CoherentStokesTest : public UnitTest : UnitTest(ps, "BeamFormer/CoherentStokes.cl") { - if (ps.nrChannelsPerSubband() >= 19 && ps.nrSamplesPerChannel() >= 175 && ps.nrTABs(SAP) >= 5) { - MultiArraySharedBuffer<float, 4> stokesData(boost::extents[ps.nrTABs(SAP)][ps.nrCoherentStokes()][ps.nrSamplesPerChannel() / ps.coherentStokesTimeIntegrationFactor()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); + if (ps.nrChannelsPerSubband() >= 19 && ps.nrSamplesPerChannel() >= 175 && ps.nrTABs(0) >= 5) { + MultiArraySharedBuffer<float, 4> stokesData(boost::extents[ps.nrTABs(0)][ps.nrCoherentStokes()][ps.nrSamplesPerChannel() / ps.coherentStokesTimeIntegrationFactor()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); #if 1 - MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrChannelsPerSubband()][ps.nrSamplesPerChannel()][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY); CoherentStokesKernel stokesKernel(ps, program, stokesData, complexVoltages); complexVoltages[18][174][4][0] = std::complex<float>(2, 3); complexVoltages[18][174][4][1] = std::complex<float>(4, 5); #else - MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel()][ps.nrChannelsPerSubband()], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY); CoherentStokesKernel stokesKernel(ps, program, stokesData, complexVoltages); complexVoltages[18][174][4][0] = std::complex<float>(2, 3); @@ -1815,10 +2011,10 @@ struct UHEP_BeamFormerTest : public UnitTest : UnitTest(ps, "UHEP/BeamFormer.cl") { - if (ps.nrStations() >= 5 && (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) >= 13 && ps.nrSubbands() >= 7 && ps.nrTABs(SAP) >= 6) { + if (ps.nrStations() >= 5 && (ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1) >= 13 && ps.nrSubbands() >= 7 && ps.nrTABs(0) >= 6) { MultiArraySharedBuffer<char, 5> inputSamples(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][NR_POLARIZATIONS][ps.nrBytesPerComplexSample()], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); - MultiArraySharedBuffer<std::complex<float>, 3> beamFormerWeights(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrTABs(SAP)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); - MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE); + MultiArraySharedBuffer<std::complex<float>, 3> beamFormerWeights(boost::extents[ps.nrStations()][ps.nrSubbands()][ps.nrTABs(0)], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE); UHEP_BeamFormerKernel beamFormer(ps, program, complexVoltages, inputSamples, beamFormerWeights); switch (ps.nrBytesPerComplexSample()) { @@ -1851,9 +2047,9 @@ struct UHEP_TransposeTest : public UnitTest : UnitTest(ps, "UHEP/Transpose.cl") { - if (ps.nrSubbands() >= 19 && ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1 >= 175 && ps.nrTABs(SAP) >= 5) { - MultiArraySharedBuffer<std::complex<float>, 4> transposedData(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][512], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); - MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][ps.nrTABs(SAP)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY); + if (ps.nrSubbands() >= 19 && ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1 >= 175 && ps.nrTABs(0) >= 5) { + MultiArraySharedBuffer<std::complex<float>, 4> transposedData(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][512], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); + MultiArraySharedBuffer<std::complex<float>, 4> complexVoltages(boost::extents[ps.nrSubbands()][ps.nrSamplesPerChannel() + NR_STATION_FILTER_TAPS - 1][ps.nrTABs(0)][NR_POLARIZATIONS], queue, CL_MEM_READ_WRITE, CL_MEM_READ_ONLY); cl::Buffer devReverseSubbandMapping(context, CL_MEM_READ_ONLY, 512 * sizeof(int)); UHEP_TransposeKernel transpose(ps, program, transposedData, complexVoltages, devReverseSubbandMapping); @@ -1876,9 +2072,9 @@ struct UHEP_TriggerTest : public UnitTest : UnitTest(ps, "UHEP/Trigger.cl") { - if (ps.nrTABs(SAP) >= 4 && 1024 * ps.nrSamplesPerChannel() > 100015) { - MultiArraySharedBuffer<float, 3> inputData(boost::extents[ps.nrTABs(SAP)][NR_POLARIZATIONS][ps.nrSamplesPerChannel() * 1024], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); - MultiArraySharedBuffer<TriggerInfo, 1> triggerInfo(boost::extents[ps.nrTABs(SAP)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); + if (ps.nrTABs(0) >= 4 && 1024 * ps.nrSamplesPerChannel() > 100015) { + MultiArraySharedBuffer<float, 3> inputData(boost::extents[ps.nrTABs(0)][NR_POLARIZATIONS][ps.nrSamplesPerChannel() * 1024], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); + MultiArraySharedBuffer<TriggerInfo, 1> triggerInfo(boost::extents[ps.nrTABs(0)], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); UHEP_TriggerKernel trigger(ps, program, triggerInfo, inputData); inputData[3][1][100015] = 1000; @@ -1900,7 +2096,7 @@ struct UHEP_TriggerTest : public UnitTest struct FFT_Test : public UnitTest { FFT_Test(const Parset &ps) - : UnitTest(ps, "fft2.cl") + : UnitTest(ps, "fft.cl") { MultiArraySharedBuffer<std::complex<float>, 1> in(boost::extents[8], queue, CL_MEM_WRITE_ONLY, CL_MEM_READ_ONLY); MultiArraySharedBuffer<std::complex<float>, 1> out(boost::extents[8], queue, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY); @@ -1979,34 +2175,14 @@ int main(int argc, char **argv) const char *str = getenv("NR_GPUS"); nrGPUs = str ? atoi(str) : 1; -#if 0 - ps.nrSubbands() = 10;//488; - ps.nrChannelsPerSubband() = 64; - ps.nrBeams() = 1; - ps.nrSamplesPerChannel() = 196608 / ps.nrChannelsPerSubband(); - ps.subbandBandwidth() = 195312.5; - ps.correctBandPass() = true; -#endif + profiling = false; CorrelatorPipeline(ps).doWork(); + profiling = true; CorrelatorPipeline(ps).doWork(); - //profiling = false; CorrelatorPipeline(ps).doWork(); - //profiling = true; CorrelatorPipeline(ps).doWork(); - - (CorrelatorTest)(ps); + //(CorrelatorTest)(ps); + //(CorrelateRectangleTest)(ps); + //(CorrelateTriangleTest)(ps); #if 0 - ps.nrSubbands() = 488; - ps.nrChannelsPerSubband() = 2048; - ps.nrBeams() = 1; - ps.nrTABs(SAP) = 128; - ps.nrIncoherentStokes() = 4; - ps.nrCoherentStokes() = 4; - ps.incoherentStokesTimeIntegrationFactor() = 8; - ps.coherentStokesTimeIntegrationFactor() = 8; - ps.nrSamplesPerChannel() = 65536 / ps.nrChannelsPerSubband();//262144 / ps.nrChannelsPerSubband(); - ps.subbandBandwidth() = 195312.5; - ps.correctBandPass() = true; - ps.dedispersionFFTsize() = ps.nrSamplesPerChannel(); - profiling = false; BeamFormerPipeline(ps).doWork(); profiling = true; BeamFormerPipeline(ps).doWork(); //(IncoherentStokesTest)(ps); @@ -2018,12 +2194,6 @@ int main(int argc, char **argv) #endif #if 0 - ps.nrSubbands() = 488; - ps.nrSamplesPerChannel() = 1024; - ps.nrBeams() = 1; - ps.subbandBandwidth() = 195312.5; - ps.nrTABs(SAP) = 48; - profiling = false; UHEP_Pipeline(ps).doWork(); profiling = true; UHEP_Pipeline(ps).doWork(); //(UHEP_BeamFormerTest)(ps); diff --git a/RTCP/GPUProc/src/UHEP/BeamFormer.cl-0.ptx b/RTCP/GPUProc/src/UHEP/BeamFormer.cl-0.ptx deleted file mode 100644 index ea220810bb76a4961820e5b0583dc8ffe90325aa..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/UHEP/BeamFormer.cl-0.ptx +++ /dev/null @@ -1,207 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Tue Feb 7 07:15:59 2012 (1328595359) -// Driver 295.20 -// - -.version 3.0 -.target sm_21, texmode_independent -.address_size 32 - -.extern .shared .align 16 .b8 shr_2__local[2048]; - -.entry complexVoltages( - .param .u32 .ptr .global .align 1 complexVoltages_param_0, - .param .u32 .ptr .global .align 1 complexVoltages_param_1, - .param .u32 .ptr .global .align 1 complexVoltages_param_2 -) -{ - .reg .f32 %f<173>; - .reg .pred %p<7>; - .reg .s32 %r<76>; - .reg .s16 %rc<16>; - - - ld.param.u32 %r29, [complexVoltages_param_2]; - // inline asm - mov.u32 %r22, %tid.x; - // inline asm - // inline asm - mov.u32 %r23, %tid.y; - // inline asm - // inline asm - mov.u32 %r24, %envreg5; - // inline asm - // inline asm - mov.u32 %r25, %ntid.z; - // inline asm - // inline asm - mov.u32 %r26, %ctaid.z; - // inline asm - // inline asm - mov.u32 %r27, %tid.z; - // inline asm - add.s32 %r30, %r27, %r24; - mad.lo.s32 %r4, %r26, %r25, %r30; - shl.b32 %r31, %r23, 4; - mad.lo.s32 %r32, %r4, 66496, %r31; - shl.b32 %r33, %r22, 3; - add.s32 %r5, %r32, %r33; - shl.b32 %r34, %r4, 5; - shl.b32 %r35, %r23, 3; - add.s32 %r36, %r34, %r35; - add.s32 %r37, %r29, %r36; - ld.global.v2.f32 {%f131, %f132}, [%r37]; - ld.global.v2.f32 {%f137, %f138}, [%r37+15616]; - ld.global.v2.f32 {%f143, %f144}, [%r37+31232]; - ld.global.v2.f32 {%f149, %f150}, [%r37+46848]; - ld.global.v2.f32 {%f155, %f156}, [%r37+62464]; - ld.global.v2.f32 {%f161, %f162}, [%r37+78080]; - ld.global.v2.f32 {%f167, %f168}, [%r37+93696]; - mov.u32 %r72, 0; - -BB0_1: - shl.b32 %r40, %r72, 10; - add.s32 %r7, %r5, %r40; - // inline asm - mov.u32 %r38, %tid.x; - // inline asm - // inline asm - mov.u32 %r39, %tid.y; - // inline asm - shl.b32 %r9, %r39, 1; - add.s32 %r74, %r9, %r38; - setp.gt.u32 %p1, %r74, 127; - @%p1 bra BB0_6; - - add.s32 %r11, %r38, %r9; - cvt.u8.u32 %rc1, %r11; - mov.u16 %rc15, 0; - mov.u32 %r73, 0; - -BB0_3: - add.s32 %r14, %r11, %r73; - shl.b32 %r42, %r72, 4; - shl.b16 %rc5, %rc15, 3; - add.s16 %rc6, %rc1, %rc5; - cvt.u32.u8 %r43, %rc6; - and.b32 %r44, %r43, 15; - add.s32 %r15, %r42, %r44; - setp.gt.u32 %p2, %r15, 1038; - @%p2 bra BB0_5; - - shr.u32 %r45, %r14, 4; - ld.param.u32 %r71, [complexVoltages_param_1]; - mad.lo.s32 %r46, %r45, 2028128, %r71; - mad.lo.s32 %r47, %r4, 4156, %r46; - shl.b32 %r48, %r15, 2; - add.s32 %r49, %r47, %r48; - ld.global.v4.u8 {%rc11, %rc12, %rc13, %rc14}, [%r49]; - // inline asm - cvt.rn.f32.s8 %f1, %rc11; - // inline asm - // inline asm - cvt.rn.f32.s8 %f2, %rc12; - // inline asm - // inline asm - cvt.rn.f32.s8 %f3, %rc13; - // inline asm - // inline asm - cvt.rn.f32.s8 %f4, %rc14; - // inline asm - shl.b32 %r50, %r74, 4; - mov.u32 %r51, shr_2__local; - add.s32 %r52, %r51, %r50; - st.shared.v4.f32 [%r52], {%f1, %f2, %f3, %f4}; - -BB0_5: - add.s32 %r74, %r74, 8; - setp.lt.u32 %p3, %r74, 128; - add.s32 %r73, %r73, 8; - add.s16 %rc15, %rc15, 1; - @%p3 bra BB0_3; - -BB0_6: - bar.sync 0; - mov.u32 %r56, 1039; - shl.b32 %r57, %r72, 4; - sub.s32 %r18, %r56, %r57; - mov.u32 %r54, 16; - // inline asm - min.u32 %r53, %r54, %r18; - // inline asm - setp.eq.s32 %p4, %r53, 0; - @%p4 bra BB0_9; - - mov.u32 %r75, 0; - -BB0_8: - shl.b32 %r62, %r75, 6; - add.s32 %r63, %r7, %r62; - ld.param.u32 %r70, [complexVoltages_param_0]; - add.s32 %r64, %r70, %r63; - shl.b32 %r65, %r75, 4; - mov.u32 %r66, shr_2__local; - add.s32 %r67, %r66, %r65; - add.s32 %r69, %r67, %r33; - ld.shared.v2.f32 {%f27, %f28}, [%r69]; - mov.f32 %f5, 0f00000000; - fma.rn.ftz.f32 %f31, %f131, %f27, %f5; - fma.rn.ftz.f32 %f32, %f131, %f28, %f5; - neg.ftz.f32 %f7, %f28; - fma.rn.ftz.f32 %f39, %f132, %f7, %f31; - fma.rn.ftz.f32 %f40, %f132, %f27, %f32; - ld.shared.v2.f32 {%f43, %f44}, [%r69+256]; - fma.rn.ftz.f32 %f45, %f137, %f43, %f39; - fma.rn.ftz.f32 %f46, %f137, %f44, %f40; - neg.ftz.f32 %f10, %f44; - fma.rn.ftz.f32 %f53, %f138, %f10, %f45; - fma.rn.ftz.f32 %f54, %f138, %f43, %f46; - ld.shared.v2.f32 {%f57, %f58}, [%r69+512]; - fma.rn.ftz.f32 %f59, %f143, %f57, %f53; - fma.rn.ftz.f32 %f60, %f143, %f58, %f54; - neg.ftz.f32 %f13, %f58; - fma.rn.ftz.f32 %f67, %f144, %f13, %f59; - fma.rn.ftz.f32 %f68, %f144, %f57, %f60; - ld.shared.v2.f32 {%f71, %f72}, [%r69+768]; - fma.rn.ftz.f32 %f73, %f149, %f71, %f67; - fma.rn.ftz.f32 %f74, %f149, %f72, %f68; - neg.ftz.f32 %f16, %f72; - fma.rn.ftz.f32 %f81, %f150, %f16, %f73; - fma.rn.ftz.f32 %f82, %f150, %f71, %f74; - ld.shared.v2.f32 {%f85, %f86}, [%r69+1024]; - fma.rn.ftz.f32 %f87, %f155, %f85, %f81; - fma.rn.ftz.f32 %f88, %f155, %f86, %f82; - neg.ftz.f32 %f19, %f86; - fma.rn.ftz.f32 %f95, %f156, %f19, %f87; - fma.rn.ftz.f32 %f96, %f156, %f85, %f88; - ld.shared.v2.f32 {%f99, %f100}, [%r69+1280]; - fma.rn.ftz.f32 %f101, %f161, %f99, %f95; - fma.rn.ftz.f32 %f102, %f161, %f100, %f96; - neg.ftz.f32 %f22, %f100; - fma.rn.ftz.f32 %f109, %f162, %f22, %f101; - fma.rn.ftz.f32 %f110, %f162, %f99, %f102; - ld.shared.v2.f32 {%f113, %f114}, [%r69+1536]; - fma.rn.ftz.f32 %f115, %f167, %f113, %f109; - fma.rn.ftz.f32 %f116, %f167, %f114, %f110; - neg.ftz.f32 %f25, %f114; - fma.rn.ftz.f32 %f123, %f168, %f25, %f115; - fma.rn.ftz.f32 %f124, %f168, %f113, %f116; - st.global.v2.f32 [%r64], {%f123, %f124}; - // inline asm - min.u32 %r59, %r54, %r18; - // inline asm - add.s32 %r75, %r75, 1; - setp.lt.u32 %p5, %r75, %r59; - @%p5 bra BB0_8; - -BB0_9: - bar.sync 0; - add.s32 %r72, %r72, 1; - setp.ne.s32 %p6, %r72, 65; - @%p6 bra BB0_1; - - ret; -} - - diff --git a/RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop-0.ptx b/RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop-0.ptx deleted file mode 100644 index 79498463a835875a85828f52fd7f756eb2b37aab..0000000000000000000000000000000000000000 Binary files a/RTCP/GPUProc/src/UHEP/BeamFormer.cl.hop-0.ptx and /dev/null differ diff --git a/RTCP/GPUProc/src/UHEP/InvFFT.cl-0.ptx b/RTCP/GPUProc/src/UHEP/InvFFT.cl-0.ptx deleted file mode 100644 index 86d5c2b091806e681f5dfe3df13f2ce5d1d68535..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/UHEP/InvFFT.cl-0.ptx +++ /dev/null @@ -1,759 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Tue Feb 7 07:15:59 2012 (1328595359) -// Driver 295.20 -// - -.version 3.0 -.target sm_21, texmode_independent -.address_size 32 - -.extern .shared .align 32 .b8 shr_3_lds[4096]; - -.entry inv_fft( - .param .u32 .ptr .global .align 8 inv_fft_param_0, - .param .u32 .ptr .global .align 4 inv_fft_param_1 -) -.reqntid 128, 1, 1 -{ - .reg .f32 %f<576>; - .reg .pred %p<2>; - .reg .s32 %r<212>; - - - ld.param.u32 %r26, [inv_fft_param_1]; - // inline asm - mov.u32 %r2, %tid.x; - // inline asm - // inline asm - mov.u32 %r3, %envreg4; - // inline asm - // inline asm - mov.u32 %r4, %ntid.y; - // inline asm - // inline asm - mov.u32 %r5, %ctaid.y; - // inline asm - // inline asm - mov.u32 %r6, %tid.y; - // inline asm - add.s32 %r27, %r6, %r3; - mad.lo.s32 %r28, %r5, %r4, %r27; - shl.b32 %r29, %r28, 12; - add.s32 %r30, %r26, %r29; - shl.b32 %r31, %r2, 3; - add.s32 %r32, %r30, %r31; - ld.global.v2.f32 {%f574, %f575}, [%r32]; - // inline asm - mov.u32 %r7, %tid.x; - // inline asm - // inline asm - mov.u32 %r8, %envreg4; - // inline asm - // inline asm - mov.u32 %r9, %ntid.y; - // inline asm - // inline asm - mov.u32 %r10, %ctaid.y; - // inline asm - // inline asm - mov.u32 %r11, %tid.y; - // inline asm - add.s32 %r33, %r11, %r8; - mad.lo.s32 %r34, %r10, %r9, %r33; - shl.b32 %r35, %r34, 12; - add.s32 %r36, %r26, %r35; - shl.b32 %r37, %r7, 3; - add.s32 %r38, %r37, %r36; - ld.global.v2.f32 {%f516, %f517}, [%r38+1024]; - // inline asm - mov.u32 %r12, %tid.x; - // inline asm - // inline asm - mov.u32 %r13, %envreg4; - // inline asm - // inline asm - mov.u32 %r14, %ntid.y; - // inline asm - // inline asm - mov.u32 %r15, %ctaid.y; - // inline asm - // inline asm - mov.u32 %r16, %tid.y; - // inline asm - add.s32 %r39, %r16, %r13; - mad.lo.s32 %r40, %r15, %r14, %r39; - shl.b32 %r41, %r40, 12; - add.s32 %r42, %r26, %r41; - shl.b32 %r43, %r12, 3; - add.s32 %r44, %r43, %r42; - ld.global.v2.f32 {%f510, %f511}, [%r44+2048]; - // inline asm - mov.u32 %r17, %tid.x; - // inline asm - // inline asm - mov.u32 %r18, %envreg4; - // inline asm - // inline asm - mov.u32 %r19, %ntid.y; - // inline asm - // inline asm - mov.u32 %r20, %ctaid.y; - // inline asm - // inline asm - mov.u32 %r21, %tid.y; - // inline asm - add.s32 %r45, %r21, %r18; - mad.lo.s32 %r46, %r20, %r19, %r45; - shl.b32 %r47, %r46, 12; - add.s32 %r48, %r26, %r47; - shl.b32 %r49, %r17, 3; - add.s32 %r50, %r49, %r48; - ld.global.v2.f32 {%f522, %f523}, [%r50+3072]; - // inline asm - mov.u32 %r22, %tid.x; - // inline asm - shl.b32 %r51, %r22, 3; - mov.u32 %r52, shr_3_lds; - add.s32 %r53, %r52, %r51; - st.shared.v2.f32 [%r53], {%f574, %f575}; - // inline asm - mov.u32 %r23, %tid.x; - // inline asm - shl.b32 %r54, %r23, 3; - add.s32 %r55, %r54, %r52; - st.shared.v2.f32 [%r55+1024], {%f516, %f517}; - // inline asm - mov.u32 %r24, %tid.x; - // inline asm - shl.b32 %r56, %r24, 3; - add.s32 %r57, %r56, %r52; - st.shared.v2.f32 [%r57+2048], {%f510, %f511}; - // inline asm - mov.u32 %r25, %tid.x; - // inline asm - shl.b32 %r58, %r25, 3; - add.s32 %r59, %r58, %r52; - st.shared.v2.f32 [%r59+3072], {%f522, %f523}; - bar.sync 0; - // inline asm - mov.u32 %r60, %tid.x; - // inline asm - setp.eq.s32 %p1, %r60, 0; - @%p1 bra BB0_2; - - // inline asm - mov.u32 %r61, %tid.x; - // inline asm - mov.u32 %r62, 512; - sub.s32 %r63, %r62, %r61; - shl.b32 %r64, %r63, 3; - add.s32 %r66, %r52, %r64; - ld.shared.v2.f32 {%f572, %f573}, [%r66]; - bra.uni BB0_3; - -BB0_2: - mov.f32 %f17, 0f00000000; - mov.f32 %f572, %f575; - mov.f32 %f573, %f17; - mov.f32 %f574, %f574; - mov.f32 %f575, %f17; - -BB0_3: - // inline asm - mov.u32 %r67, %tid.x; - // inline asm - mov.u32 %r70, 384; - sub.s32 %r71, %r70, %r67; - shl.b32 %r72, %r71, 3; - add.s32 %r74, %r52, %r72; - ld.shared.v2.f32 {%f490, %f491}, [%r74]; - // inline asm - mov.u32 %r68, %tid.x; - // inline asm - mov.u32 %r75, 256; - sub.s32 %r76, %r75, %r68; - shl.b32 %r77, %r76, 3; - add.s32 %r78, %r52, %r77; - ld.shared.v2.f32 {%f492, %f493}, [%r78]; - // inline asm - mov.u32 %r69, %tid.x; - // inline asm - mov.u32 %r79, 128; - sub.s32 %r80, %r79, %r69; - shl.b32 %r81, %r80, 3; - add.s32 %r82, %r52, %r81; - neg.ftz.f32 %f20, %f573; - neg.ftz.f32 %f22, %f491; - neg.ftz.f32 %f24, %f493; - ld.shared.v2.f32 {%f500, %f501}, [%r82]; - neg.ftz.f32 %f26, %f501; - add.ftz.f32 %f504, %f574, %f572; - add.ftz.f32 %f505, %f575, %f20; - sub.ftz.f32 %f506, %f574, %f572; - sub.ftz.f32 %f507, %f575, %f20; - add.ftz.f32 %f508, %f510, %f492; - add.ftz.f32 %f509, %f511, %f24; - sub.ftz.f32 %f512, %f510, %f492; - sub.ftz.f32 %f513, %f511, %f24; - add.ftz.f32 %f514, %f516, %f490; - add.ftz.f32 %f515, %f517, %f22; - sub.ftz.f32 %f518, %f516, %f490; - sub.ftz.f32 %f519, %f517, %f22; - add.ftz.f32 %f520, %f522, %f500; - add.ftz.f32 %f521, %f523, %f26; - sub.ftz.f32 %f524, %f522, %f500; - sub.ftz.f32 %f525, %f523, %f26; - add.ftz.f32 %f526, %f504, %f508; - add.ftz.f32 %f527, %f505, %f509; - sub.ftz.f32 %f528, %f504, %f508; - sub.ftz.f32 %f529, %f505, %f509; - neg.ftz.f32 %f28, %f513; - add.ftz.f32 %f534, %f506, %f28; - add.ftz.f32 %f535, %f507, %f512; - sub.ftz.f32 %f536, %f506, %f28; - sub.ftz.f32 %f537, %f507, %f512; - add.ftz.f32 %f538, %f514, %f520; - add.ftz.f32 %f539, %f515, %f521; - sub.ftz.f32 %f540, %f514, %f520; - sub.ftz.f32 %f541, %f515, %f521; - neg.ftz.f32 %f31, %f525; - add.ftz.f32 %f546, %f518, %f31; - add.ftz.f32 %f547, %f519, %f524; - sub.ftz.f32 %f548, %f518, %f31; - sub.ftz.f32 %f549, %f519, %f524; - add.ftz.f32 %f446, %f526, %f538; - add.ftz.f32 %f447, %f527, %f539; - sub.ftz.f32 %f454, %f526, %f538; - sub.ftz.f32 %f455, %f527, %f539; - neg.ftz.f32 %f34, %f547; - add.ftz.f32 %f554, %f34, %f546; - add.ftz.f32 %f555, %f546, %f547; - mov.f32 %f36, 0f3F3504F3; - fma.rn.ftz.f32 %f448, %f554, %f36, %f534; - fma.rn.ftz.f32 %f449, %f555, %f36, %f535; - neg.f32 %f558, %f554; - neg.f32 %f559, %f555; - fma.rn.ftz.f32 %f456, %f558, %f36, %f534; - fma.rn.ftz.f32 %f457, %f559, %f36, %f535; - neg.ftz.f32 %f38, %f541; - add.ftz.f32 %f450, %f528, %f38; - add.ftz.f32 %f451, %f529, %f540; - sub.ftz.f32 %f458, %f528, %f38; - sub.ftz.f32 %f459, %f529, %f540; - neg.ftz.f32 %f41, %f549; - sub.ftz.f32 %f568, %f41, %f548; - sub.ftz.f32 %f569, %f548, %f549; - fma.rn.ftz.f32 %f452, %f568, %f36, %f536; - fma.rn.ftz.f32 %f453, %f569, %f36, %f537; - neg.f32 %f570, %f568; - neg.f32 %f571, %f569; - fma.rn.ftz.f32 %f460, %f570, %f36, %f536; - fma.rn.ftz.f32 %f461, %f571, %f36, %f537; - bar.sync 0; - // inline asm - mov.u32 %r83, %tid.x; - // inline asm - shl.b32 %r84, %r83, 5; - add.s32 %r86, %r52, %r84; - st.shared.v4.f32 [%r86+16], {%f454, %f456, %f458, %f460}; - st.shared.v4.f32 [%r86], {%f446, %f448, %f450, %f452}; - bar.sync 0; - // inline asm - mov.u32 %r87, %tid.x; - // inline asm - shl.b32 %r95, %r87, 2; - add.s32 %r97, %r52, %r95; - ld.shared.f32 %f51, [%r97]; - // inline asm - mov.u32 %r88, %tid.x; - // inline asm - shl.b32 %r98, %r88, 2; - add.s32 %r99, %r98, %r52; - ld.shared.f32 %f1, [%r99+512]; - // inline asm - mov.u32 %r89, %tid.x; - // inline asm - shl.b32 %r100, %r89, 2; - add.s32 %r101, %r100, %r52; - ld.shared.f32 %f2, [%r101+1024]; - // inline asm - mov.u32 %r90, %tid.x; - // inline asm - shl.b32 %r102, %r90, 2; - add.s32 %r103, %r102, %r52; - ld.shared.f32 %f3, [%r103+1536]; - // inline asm - mov.u32 %r91, %tid.x; - // inline asm - shl.b32 %r104, %r91, 2; - add.s32 %r105, %r104, %r52; - ld.shared.f32 %f4, [%r105+2048]; - // inline asm - mov.u32 %r92, %tid.x; - // inline asm - shl.b32 %r106, %r92, 2; - add.s32 %r107, %r106, %r52; - ld.shared.f32 %f5, [%r107+2560]; - // inline asm - mov.u32 %r93, %tid.x; - // inline asm - shl.b32 %r108, %r93, 2; - add.s32 %r109, %r108, %r52; - ld.shared.f32 %f6, [%r109+3072]; - // inline asm - mov.u32 %r94, %tid.x; - // inline asm - shl.b32 %r110, %r94, 2; - add.s32 %r111, %r110, %r52; - ld.shared.f32 %f7, [%r111+3584]; - bar.sync 0; - // inline asm - mov.u32 %r112, %tid.x; - // inline asm - shl.b32 %r113, %r112, 5; - add.s32 %r115, %r52, %r113; - st.shared.v4.f32 [%r115+16], {%f455, %f457, %f459, %f461}; - st.shared.v4.f32 [%r115], {%f447, %f449, %f451, %f453}; - bar.sync 0; - // inline asm - mov.u32 %r116, %tid.x; - // inline asm - shl.b32 %r124, %r116, 2; - add.s32 %r126, %r52, %r124; - ld.shared.f32 %f61, [%r126]; - // inline asm - mov.u32 %r117, %tid.x; - // inline asm - shl.b32 %r127, %r117, 2; - add.s32 %r128, %r127, %r52; - ld.shared.f32 %f8, [%r128+512]; - // inline asm - mov.u32 %r118, %tid.x; - // inline asm - shl.b32 %r129, %r118, 2; - add.s32 %r130, %r129, %r52; - ld.shared.f32 %f9, [%r130+1024]; - // inline asm - mov.u32 %r119, %tid.x; - // inline asm - shl.b32 %r131, %r119, 2; - add.s32 %r132, %r131, %r52; - ld.shared.f32 %f10, [%r132+1536]; - // inline asm - mov.u32 %r120, %tid.x; - // inline asm - shl.b32 %r133, %r120, 2; - add.s32 %r134, %r133, %r52; - ld.shared.f32 %f11, [%r134+2048]; - // inline asm - mov.u32 %r121, %tid.x; - // inline asm - shl.b32 %r135, %r121, 2; - add.s32 %r136, %r135, %r52; - ld.shared.f32 %f12, [%r136+2560]; - // inline asm - mov.u32 %r122, %tid.x; - // inline asm - shl.b32 %r137, %r122, 2; - add.s32 %r138, %r137, %r52; - ld.shared.f32 %f13, [%r138+3072]; - // inline asm - mov.u32 %r123, %tid.x; - // inline asm - shl.b32 %r139, %r123, 2; - add.s32 %r140, %r139, %r52; - ld.shared.f32 %f14, [%r140+3584]; - bar.sync 0; - mov.f32 %f90, 0f40000000; - mul.ftz.f32 %f15, %f90, 0f40490FDB; - div.rn.ftz.f32 %f91, %f15, 0f42800000; - // inline asm - mov.u32 %r141, %tid.x; - // inline asm - and.b32 %r144, %r141, 7; - cvt.rn.f32.u32 %f92, %r144; - mul.ftz.f32 %f65, %f91, %f92; - // inline asm - cos.approx.f32 %f62, %f65; - // inline asm - // inline asm - sin.approx.f32 %f64, %f65; - // inline asm - mul.ftz.f32 %f93, %f62, %f1; - neg.f32 %f94, %f64; - fma.rn.ftz.f32 %f95, %f94, %f8, %f93; - mul.ftz.f32 %f96, %f64, %f1; - fma.rn.ftz.f32 %f97, %f62, %f8, %f96; - fma.rn.ftz.f32 %f67, %f91, %f92, %f65; - // inline asm - cos.approx.f32 %f66, %f67; - // inline asm - // inline asm - sin.approx.f32 %f68, %f67; - // inline asm - mul.ftz.f32 %f98, %f66, %f2; - neg.f32 %f99, %f68; - fma.rn.ftz.f32 %f100, %f99, %f9, %f98; - mul.ftz.f32 %f101, %f68, %f2; - fma.rn.ftz.f32 %f102, %f66, %f9, %f101; - mul.ftz.f32 %f73, %f65, 0f40400000; - // inline asm - cos.approx.f32 %f70, %f73; - // inline asm - // inline asm - sin.approx.f32 %f72, %f73; - // inline asm - mul.ftz.f32 %f103, %f70, %f3; - neg.f32 %f104, %f72; - fma.rn.ftz.f32 %f105, %f104, %f10, %f103; - mul.ftz.f32 %f106, %f72, %f3; - fma.rn.ftz.f32 %f107, %f70, %f10, %f106; - mul.ftz.f32 %f77, %f65, 0f40800000; - // inline asm - cos.approx.f32 %f74, %f77; - // inline asm - // inline asm - sin.approx.f32 %f76, %f77; - // inline asm - mul.ftz.f32 %f108, %f74, %f4; - neg.f32 %f109, %f76; - fma.rn.ftz.f32 %f110, %f109, %f11, %f108; - mul.ftz.f32 %f111, %f76, %f4; - fma.rn.ftz.f32 %f112, %f74, %f11, %f111; - mul.ftz.f32 %f81, %f65, 0f40A00000; - // inline asm - cos.approx.f32 %f78, %f81; - // inline asm - // inline asm - sin.approx.f32 %f80, %f81; - // inline asm - mul.ftz.f32 %f113, %f78, %f5; - neg.f32 %f114, %f80; - fma.rn.ftz.f32 %f115, %f114, %f12, %f113; - mul.ftz.f32 %f116, %f80, %f5; - fma.rn.ftz.f32 %f117, %f78, %f12, %f116; - mul.ftz.f32 %f85, %f65, 0f40C00000; - // inline asm - cos.approx.f32 %f82, %f85; - // inline asm - // inline asm - sin.approx.f32 %f84, %f85; - // inline asm - mul.ftz.f32 %f118, %f82, %f6; - neg.f32 %f119, %f84; - fma.rn.ftz.f32 %f120, %f119, %f13, %f118; - mul.ftz.f32 %f121, %f84, %f6; - fma.rn.ftz.f32 %f122, %f82, %f13, %f121; - mul.ftz.f32 %f89, %f65, 0f40E00000; - // inline asm - cos.approx.f32 %f86, %f89; - // inline asm - // inline asm - sin.approx.f32 %f88, %f89; - // inline asm - mul.ftz.f32 %f123, %f86, %f7; - neg.f32 %f124, %f88; - fma.rn.ftz.f32 %f125, %f124, %f14, %f123; - mul.ftz.f32 %f126, %f88, %f7; - fma.rn.ftz.f32 %f127, %f86, %f14, %f126; - add.ftz.f32 %f380, %f51, %f110; - add.ftz.f32 %f381, %f61, %f112; - sub.ftz.f32 %f384, %f51, %f110; - sub.ftz.f32 %f385, %f61, %f112; - add.ftz.f32 %f386, %f100, %f120; - add.ftz.f32 %f387, %f102, %f122; - sub.ftz.f32 %f388, %f100, %f120; - sub.ftz.f32 %f389, %f102, %f122; - add.ftz.f32 %f390, %f95, %f115; - add.ftz.f32 %f391, %f97, %f117; - sub.ftz.f32 %f392, %f95, %f115; - sub.ftz.f32 %f393, %f97, %f117; - add.ftz.f32 %f394, %f105, %f125; - add.ftz.f32 %f395, %f107, %f127; - sub.ftz.f32 %f396, %f105, %f125; - sub.ftz.f32 %f397, %f107, %f127; - add.ftz.f32 %f398, %f380, %f386; - add.ftz.f32 %f399, %f381, %f387; - sub.ftz.f32 %f400, %f380, %f386; - sub.ftz.f32 %f401, %f381, %f387; - neg.ftz.f32 %f129, %f389; - add.ftz.f32 %f406, %f384, %f129; - add.ftz.f32 %f407, %f385, %f388; - sub.ftz.f32 %f408, %f384, %f129; - sub.ftz.f32 %f409, %f385, %f388; - add.ftz.f32 %f410, %f390, %f394; - add.ftz.f32 %f411, %f391, %f395; - sub.ftz.f32 %f412, %f390, %f394; - sub.ftz.f32 %f413, %f391, %f395; - neg.ftz.f32 %f132, %f397; - add.ftz.f32 %f418, %f392, %f132; - add.ftz.f32 %f419, %f393, %f396; - sub.ftz.f32 %f420, %f392, %f132; - sub.ftz.f32 %f421, %f393, %f396; - add.ftz.f32 %f348, %f398, %f410; - add.ftz.f32 %f349, %f399, %f411; - sub.ftz.f32 %f356, %f398, %f410; - sub.ftz.f32 %f357, %f399, %f411; - neg.ftz.f32 %f135, %f419; - add.ftz.f32 %f426, %f135, %f418; - add.ftz.f32 %f427, %f418, %f419; - fma.rn.ftz.f32 %f350, %f426, %f36, %f406; - fma.rn.ftz.f32 %f351, %f427, %f36, %f407; - neg.f32 %f430, %f426; - neg.f32 %f431, %f427; - fma.rn.ftz.f32 %f358, %f430, %f36, %f406; - fma.rn.ftz.f32 %f359, %f431, %f36, %f407; - neg.ftz.f32 %f139, %f413; - add.ftz.f32 %f352, %f400, %f139; - add.ftz.f32 %f353, %f401, %f412; - sub.ftz.f32 %f360, %f400, %f139; - sub.ftz.f32 %f361, %f401, %f412; - neg.ftz.f32 %f142, %f421; - sub.ftz.f32 %f440, %f142, %f420; - sub.ftz.f32 %f441, %f420, %f421; - fma.rn.ftz.f32 %f354, %f440, %f36, %f408; - fma.rn.ftz.f32 %f355, %f441, %f36, %f409; - neg.f32 %f442, %f440; - neg.f32 %f443, %f441; - fma.rn.ftz.f32 %f362, %f442, %f36, %f408; - fma.rn.ftz.f32 %f363, %f443, %f36, %f409; - // inline asm - mov.u32 %r142, %tid.x; - // inline asm - shl.b32 %r145, %r142, 3; - // inline asm - mov.u32 %r143, %tid.x; - // inline asm - and.b32 %r146, %r143, 7; - and.b32 %r147, %r145, 1073741760; - add.s32 %r148, %r147, %r146; - shl.b32 %r149, %r148, 2; - add.s32 %r1, %r52, %r149; - st.shared.f32 [%r1], %f348; - st.shared.f32 [%r1+32], %f350; - st.shared.f32 [%r1+64], %f352; - st.shared.f32 [%r1+96], %f354; - st.shared.f32 [%r1+128], %f356; - st.shared.f32 [%r1+160], %f358; - st.shared.f32 [%r1+192], %f360; - st.shared.f32 [%r1+224], %f362; - bar.sync 0; - // inline asm - mov.u32 %r151, %tid.x; - // inline asm - shl.b32 %r155, %r151, 3; - add.s32 %r157, %r52, %r155; - ld.shared.v2.f32 {%f344, %f345}, [%r157]; - // inline asm - mov.u32 %r152, %tid.x; - // inline asm - shl.b32 %r158, %r152, 3; - add.s32 %r159, %r158, %r52; - ld.shared.v2.f32 {%f282, %f283}, [%r159+1024]; - // inline asm - mov.u32 %r153, %tid.x; - // inline asm - shl.b32 %r160, %r153, 3; - add.s32 %r161, %r160, %r52; - ld.shared.v2.f32 {%f288, %f289}, [%r161+2048]; - // inline asm - mov.u32 %r154, %tid.x; - // inline asm - shl.b32 %r162, %r154, 3; - add.s32 %r163, %r162, %r52; - ld.shared.v2.f32 {%f294, %f295}, [%r163+3072]; - bar.sync 0; - st.shared.f32 [%r1], %f349; - st.shared.f32 [%r1+32], %f351; - st.shared.f32 [%r1+64], %f353; - st.shared.f32 [%r1+96], %f355; - st.shared.f32 [%r1+128], %f357; - st.shared.f32 [%r1+160], %f359; - st.shared.f32 [%r1+192], %f361; - st.shared.f32 [%r1+224], %f363; - bar.sync 0; - // inline asm - mov.u32 %r164, %tid.x; - // inline asm - shl.b32 %r168, %r164, 3; - add.s32 %r170, %r52, %r168; - ld.shared.v2.f32 {%f342, %f343}, [%r170]; - // inline asm - mov.u32 %r165, %tid.x; - // inline asm - shl.b32 %r171, %r165, 3; - add.s32 %r172, %r171, %r52; - ld.shared.v2.f32 {%f284, %f285}, [%r172+1024]; - // inline asm - mov.u32 %r166, %tid.x; - // inline asm - shl.b32 %r173, %r166, 3; - add.s32 %r174, %r173, %r52; - ld.shared.v2.f32 {%f290, %f291}, [%r174+2048]; - // inline asm - mov.u32 %r167, %tid.x; - // inline asm - shl.b32 %r175, %r167, 3; - add.s32 %r176, %r175, %r52; - ld.shared.v2.f32 {%f296, %f297}, [%r176+3072]; - bar.sync 0; - div.rn.ftz.f32 %f184, %f15, 0f43800000; - // inline asm - mov.u32 %r177, %tid.x; - // inline asm - shl.b32 %r180, %r177, 1; - and.b32 %r181, %r180, 62; - cvt.rn.f32.u32 %f185, %r181; - mul.ftz.f32 %f163, %f184, %f185; - // inline asm - cos.approx.f32 %f160, %f163; - // inline asm - // inline asm - sin.approx.f32 %f162, %f163; - // inline asm - mul.ftz.f32 %f187, %f160, %f282; - neg.f32 %f189, %f162; - fma.rn.ftz.f32 %f190, %f189, %f284, %f187; - mul.ftz.f32 %f191, %f162, %f282; - fma.rn.ftz.f32 %f192, %f160, %f284, %f191; - fma.rn.ftz.f32 %f165, %f184, %f185, %f163; - // inline asm - cos.approx.f32 %f164, %f165; - // inline asm - // inline asm - sin.approx.f32 %f166, %f165; - // inline asm - mul.ftz.f32 %f194, %f164, %f288; - neg.f32 %f196, %f166; - fma.rn.ftz.f32 %f197, %f196, %f290, %f194; - mul.ftz.f32 %f198, %f166, %f288; - fma.rn.ftz.f32 %f199, %f164, %f290, %f198; - mul.ftz.f32 %f171, %f163, 0f40400000; - // inline asm - cos.approx.f32 %f168, %f171; - // inline asm - // inline asm - sin.approx.f32 %f170, %f171; - // inline asm - mul.ftz.f32 %f201, %f168, %f294; - neg.f32 %f203, %f170; - fma.rn.ftz.f32 %f204, %f203, %f296, %f201; - mul.ftz.f32 %f205, %f170, %f294; - fma.rn.ftz.f32 %f206, %f168, %f296, %f205; - fma.rn.ftz.f32 %f175, %f184, %f185, %f184; - // inline asm - cos.approx.f32 %f172, %f175; - // inline asm - // inline asm - sin.approx.f32 %f174, %f175; - // inline asm - mul.ftz.f32 %f208, %f172, %f283; - neg.f32 %f210, %f174; - fma.rn.ftz.f32 %f211, %f210, %f285, %f208; - mul.ftz.f32 %f212, %f174, %f283; - fma.rn.ftz.f32 %f213, %f172, %f285, %f212; - add.ftz.f32 %f177, %f175, %f175; - // inline asm - cos.approx.f32 %f176, %f177; - // inline asm - // inline asm - sin.approx.f32 %f178, %f177; - // inline asm - mul.ftz.f32 %f215, %f176, %f289; - neg.f32 %f217, %f178; - fma.rn.ftz.f32 %f218, %f217, %f291, %f215; - mul.ftz.f32 %f219, %f178, %f289; - fma.rn.ftz.f32 %f220, %f176, %f291, %f219; - mul.ftz.f32 %f183, %f175, 0f40400000; - // inline asm - cos.approx.f32 %f180, %f183; - // inline asm - // inline asm - sin.approx.f32 %f182, %f183; - // inline asm - mul.ftz.f32 %f222, %f180, %f295; - neg.f32 %f224, %f182; - fma.rn.ftz.f32 %f225, %f224, %f297, %f222; - mul.ftz.f32 %f226, %f182, %f295; - fma.rn.ftz.f32 %f227, %f180, %f297, %f226; - add.ftz.f32 %f306, %f344, %f197; - add.ftz.f32 %f307, %f342, %f199; - sub.ftz.f32 %f310, %f344, %f197; - sub.ftz.f32 %f311, %f342, %f199; - add.ftz.f32 %f312, %f190, %f204; - add.ftz.f32 %f313, %f192, %f206; - sub.ftz.f32 %f314, %f190, %f204; - sub.ftz.f32 %f315, %f192, %f206; - add.ftz.f32 %f260, %f306, %f312; - add.ftz.f32 %f261, %f307, %f313; - sub.ftz.f32 %f272, %f306, %f312; - sub.ftz.f32 %f273, %f307, %f313; - neg.ftz.f32 %f229, %f315; - add.ftz.f32 %f266, %f310, %f229; - add.ftz.f32 %f267, %f311, %f314; - sub.ftz.f32 %f278, %f310, %f229; - sub.ftz.f32 %f279, %f311, %f314; - add.ftz.f32 %f320, %f345, %f218; - add.ftz.f32 %f321, %f343, %f220; - sub.ftz.f32 %f324, %f345, %f218; - sub.ftz.f32 %f325, %f343, %f220; - add.ftz.f32 %f326, %f211, %f225; - add.ftz.f32 %f327, %f213, %f227; - sub.ftz.f32 %f328, %f211, %f225; - sub.ftz.f32 %f329, %f213, %f227; - add.ftz.f32 %f262, %f320, %f326; - add.ftz.f32 %f263, %f321, %f327; - sub.ftz.f32 %f274, %f320, %f326; - sub.ftz.f32 %f275, %f321, %f327; - neg.ftz.f32 %f232, %f329; - add.ftz.f32 %f268, %f324, %f232; - add.ftz.f32 %f269, %f325, %f328; - sub.ftz.f32 %f280, %f324, %f232; - sub.ftz.f32 %f281, %f325, %f328; - // inline asm - mov.u32 %r178, %tid.x; - // inline asm - shl.b32 %r182, %r178, 2; - // inline asm - mov.u32 %r179, %tid.x; - // inline asm - and.b32 %r183, %r179, 31; - and.b32 %r184, %r182, 536870784; - add.s32 %r185, %r184, %r183; - shl.b32 %r186, %r185, 3; - add.s32 %r188, %r52, %r186; - st.shared.v2.f32 [%r188], {%f260, %f262}; - st.shared.v2.f32 [%r188+256], {%f266, %f268}; - st.shared.v2.f32 [%r188+512], {%f272, %f274}; - st.shared.v2.f32 [%r188+768], {%f278, %f280}; - bar.sync 0; - // inline asm - mov.u32 %r193, %tid.x; - // inline asm - shl.b32 %r197, %r193, 3; - add.s32 %r199, %r52, %r197; - st.shared.v2.f32 [%r199], {%f261, %f263}; - // inline asm - mov.u32 %r194, %tid.x; - // inline asm - shl.b32 %r200, %r194, 3; - add.s32 %r201, %r200, %r52; - st.shared.v2.f32 [%r201+1024], {%f267, %f269}; - // inline asm - mov.u32 %r195, %tid.x; - // inline asm - shl.b32 %r202, %r195, 3; - add.s32 %r203, %r202, %r52; - st.shared.v2.f32 [%r203+2048], {%f273, %f275}; - // inline asm - mov.u32 %r196, %tid.x; - // inline asm - shl.b32 %r204, %r196, 3; - add.s32 %r205, %r204, %r52; - st.shared.v2.f32 [%r205+3072], {%f279, %f281}; - bar.sync 0; - ret; -} - - diff --git a/RTCP/GPUProc/src/UHEP/InvFIR.cl-0.ptx b/RTCP/GPUProc/src/UHEP/InvFIR.cl-0.ptx deleted file mode 100644 index 2179b103dc77531446838556584310d90c722f7b..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/UHEP/InvFIR.cl-0.ptx +++ /dev/null @@ -1,447 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Tue Feb 7 07:15:59 2012 (1328595359) -// Driver 295.20 -// - -.version 3.0 -.target sm_21, texmode_independent -.address_size 32 - - -.entry invFIRfilter( - .param .u32 .ptr .global .align 1 invFIRfilter_param_0, - .param .u32 .ptr .global .align 1 invFIRfilter_param_1, - .param .u32 .ptr .global .align 1 invFIRfilter_param_2 -) -{ - .reg .f32 %f<386>; - .reg .pred %p<2>; - .reg .s32 %r<71>; - - - ld.param.u32 %r21, [invFIRfilter_param_0]; - ld.param.u32 %r22, [invFIRfilter_param_1]; - ld.param.u32 %r23, [invFIRfilter_param_2]; - // inline asm - mov.u32 %r7, %envreg3; - // inline asm - // inline asm - mov.u32 %r8, %ntid.x; - // inline asm - // inline asm - mov.u32 %r9, %ctaid.x; - // inline asm - mul.lo.s32 %r24, %r9, %r8; - // inline asm - mov.u32 %r10, %tid.x; - // inline asm - add.s32 %r25, %r10, %r7; - mad.lo.s32 %r26, %r9, %r8, %r25; - // inline asm - mov.u32 %r11, %envreg4; - // inline asm - // inline asm - mov.u32 %r12, %ntid.y; - // inline asm - // inline asm - mov.u32 %r13, %ctaid.y; - // inline asm - mul.lo.s32 %r27, %r13, %r12; - // inline asm - mov.u32 %r14, %tid.y; - // inline asm - add.s32 %r28, %r14, %r11; - mad.lo.s32 %r29, %r13, %r12, %r28; - // inline asm - mov.u32 %r15, %envreg5; - // inline asm - // inline asm - mov.u32 %r16, %ntid.z; - // inline asm - // inline asm - mov.u32 %r17, %ctaid.z; - // inline asm - mul.lo.s32 %r30, %r17, %r16; - // inline asm - mov.u32 %r18, %tid.z; - // inline asm - add.s32 %r31, %r18, %r15; - mad.lo.s32 %r32, %r17, %r16, %r31; - shl.b32 %r33, %r26, 6; - add.s32 %r34, %r23, %r33; - mad.lo.s32 %r35, %r32, 8511488, %r22; - mad.lo.s32 %r36, %r29, 4255744, %r35; - shl.b32 %r37, %r26, 2; - add.s32 %r38, %r36, %r37; - ld.global.f32 %f17, [%r38]; - mov.u32 %r70, 0; - ld.global.f32 %f18, [%r38+4096]; - ld.global.f32 %f19, [%r38+8192]; - ld.global.f32 %f20, [%r38+12288]; - ld.global.f32 %f21, [%r38+16384]; - ld.global.f32 %f22, [%r38+20480]; - ld.global.f32 %f23, [%r38+24576]; - ld.global.f32 %f24, [%r38+28672]; - ld.global.f32 %f25, [%r38+32768]; - ld.global.f32 %f26, [%r38+36864]; - ld.global.f32 %f27, [%r38+40960]; - ld.global.f32 %f28, [%r38+45056]; - ld.global.f32 %f29, [%r38+49152]; - ld.global.f32 %f30, [%r38+53248]; - ld.global.f32 %f31, [%r38+57344]; - ld.global.v4.f32 {%f367, %f368, %f369, %f370}, [%r34+48]; - ld.global.v4.f32 {%f371, %f372, %f373, %f374}, [%r34+32]; - ld.global.v4.f32 {%f375, %f376, %f377, %f378}, [%r34+16]; - ld.global.v4.f32 {%f379, %f380, %f381, %f382}, [%r34]; - shl.b32 %r39, %r30, 23; - shl.b32 %r40, %r15, 23; - add.s32 %r41, %r39, %r40; - shl.b32 %r42, %r18, 23; - add.s32 %r43, %r41, %r42; - shl.b32 %r44, %r27, 22; - add.s32 %r45, %r43, %r44; - shl.b32 %r46, %r11, 22; - add.s32 %r47, %r45, %r46; - shl.b32 %r48, %r14, 22; - add.s32 %r49, %r47, %r48; - shl.b32 %r50, %r24, 2; - add.s32 %r51, %r49, %r50; - shl.b32 %r52, %r7, 2; - add.s32 %r53, %r51, %r52; - shl.b32 %r54, %r10, 2; - add.s32 %r55, %r53, %r54; - add.s32 %r56, %r55, %r21; - add.s32 %r1, %r56, 32768; - mul.lo.s32 %r57, %r15, 8511488; - mad.lo.s32 %r58, %r30, 8511488, %r57; - mad.lo.s32 %r59, %r18, 8511488, %r58; - mad.lo.s32 %r60, %r27, 4255744, %r59; - mad.lo.s32 %r61, %r11, 4255744, %r60; - mad.lo.s32 %r62, %r14, 4255744, %r61; - add.s32 %r63, %r62, %r50; - add.s32 %r64, %r63, %r52; - add.s32 %r65, %r64, %r54; - add.s32 %r66, %r65, %r22; - add.s32 %r2, %r66, 122880; - mov.f32 %f347, %f25; - mov.f32 %f348, %f26; - mov.f32 %f349, %f27; - mov.f32 %f350, %f28; - mov.f32 %f335, %f21; - mov.f32 %f336, %f22; - mov.f32 %f337, %f23; - mov.f32 %f338, %f24; - mov.f32 %f323, %f17; - mov.f32 %f324, %f18; - mov.f32 %f325, %f19; - mov.f32 %f326, %f20; - mov.f32 %f383, %f29; - mov.f32 %f384, %f30; - mov.f32 %f385, %f31; - mov.f32 %f362, %f32; - mov.u32 %r69, 64; - -BB0_1: - mul.ftz.f32 %f35, %f369, %f324; - fma.rn.ftz.f32 %f36, %f370, %f323, %f35; - fma.rn.ftz.f32 %f38, %f368, %f325, %f36; - fma.rn.ftz.f32 %f40, %f367, %f326, %f38; - fma.rn.ftz.f32 %f42, %f374, %f335, %f40; - fma.rn.ftz.f32 %f44, %f373, %f336, %f42; - fma.rn.ftz.f32 %f46, %f372, %f337, %f44; - fma.rn.ftz.f32 %f48, %f371, %f338, %f46; - fma.rn.ftz.f32 %f50, %f378, %f347, %f48; - fma.rn.ftz.f32 %f52, %f377, %f348, %f50; - fma.rn.ftz.f32 %f54, %f376, %f349, %f52; - fma.rn.ftz.f32 %f56, %f375, %f350, %f54; - fma.rn.ftz.f32 %f58, %f382, %f383, %f56; - fma.rn.ftz.f32 %f60, %f381, %f384, %f58; - fma.rn.ftz.f32 %f62, %f380, %f385, %f60; - add.s32 %r67, %r2, %r70; - ld.global.f32 %f63, [%r67+-57344]; - fma.rn.ftz.f32 %f64, %f379, %f63, %f62; - add.s32 %r68, %r1, %r70; - st.global.f32 [%r68+-32768], %f64; - mul.ftz.f32 %f65, %f369, %f325; - fma.rn.ftz.f32 %f66, %f370, %f324, %f65; - fma.rn.ftz.f32 %f67, %f368, %f326, %f66; - fma.rn.ftz.f32 %f68, %f367, %f335, %f67; - fma.rn.ftz.f32 %f69, %f374, %f336, %f68; - fma.rn.ftz.f32 %f70, %f373, %f337, %f69; - fma.rn.ftz.f32 %f71, %f372, %f338, %f70; - fma.rn.ftz.f32 %f72, %f371, %f347, %f71; - fma.rn.ftz.f32 %f73, %f378, %f348, %f72; - fma.rn.ftz.f32 %f74, %f377, %f349, %f73; - fma.rn.ftz.f32 %f75, %f376, %f350, %f74; - fma.rn.ftz.f32 %f76, %f375, %f383, %f75; - fma.rn.ftz.f32 %f77, %f382, %f384, %f76; - fma.rn.ftz.f32 %f78, %f381, %f385, %f77; - ld.global.f32 %f79, [%r67+-53248]; - fma.rn.ftz.f32 %f80, %f380, %f79, %f78; - fma.rn.ftz.f32 %f81, %f379, %f323, %f80; - st.global.f32 [%r68+-28672], %f81; - mul.ftz.f32 %f82, %f369, %f326; - fma.rn.ftz.f32 %f83, %f370, %f325, %f82; - fma.rn.ftz.f32 %f84, %f368, %f335, %f83; - fma.rn.ftz.f32 %f85, %f367, %f336, %f84; - fma.rn.ftz.f32 %f86, %f374, %f337, %f85; - fma.rn.ftz.f32 %f87, %f373, %f338, %f86; - fma.rn.ftz.f32 %f88, %f372, %f347, %f87; - fma.rn.ftz.f32 %f89, %f371, %f348, %f88; - fma.rn.ftz.f32 %f90, %f378, %f349, %f89; - fma.rn.ftz.f32 %f91, %f377, %f350, %f90; - fma.rn.ftz.f32 %f92, %f376, %f383, %f91; - fma.rn.ftz.f32 %f93, %f375, %f384, %f92; - fma.rn.ftz.f32 %f94, %f382, %f385, %f93; - ld.global.f32 %f95, [%r67+-49152]; - fma.rn.ftz.f32 %f96, %f381, %f95, %f94; - fma.rn.ftz.f32 %f97, %f380, %f323, %f96; - fma.rn.ftz.f32 %f98, %f379, %f324, %f97; - st.global.f32 [%r68+-24576], %f98; - mul.ftz.f32 %f99, %f369, %f335; - fma.rn.ftz.f32 %f100, %f370, %f326, %f99; - fma.rn.ftz.f32 %f101, %f368, %f336, %f100; - fma.rn.ftz.f32 %f102, %f367, %f337, %f101; - fma.rn.ftz.f32 %f103, %f374, %f338, %f102; - fma.rn.ftz.f32 %f104, %f373, %f347, %f103; - fma.rn.ftz.f32 %f105, %f372, %f348, %f104; - fma.rn.ftz.f32 %f106, %f371, %f349, %f105; - fma.rn.ftz.f32 %f107, %f378, %f350, %f106; - fma.rn.ftz.f32 %f108, %f377, %f383, %f107; - fma.rn.ftz.f32 %f109, %f376, %f384, %f108; - fma.rn.ftz.f32 %f110, %f375, %f385, %f109; - ld.global.f32 %f111, [%r67+-45056]; - fma.rn.ftz.f32 %f112, %f382, %f111, %f110; - fma.rn.ftz.f32 %f113, %f381, %f323, %f112; - fma.rn.ftz.f32 %f114, %f380, %f324, %f113; - fma.rn.ftz.f32 %f115, %f379, %f325, %f114; - st.global.f32 [%r68+-20480], %f115; - mul.ftz.f32 %f116, %f369, %f336; - fma.rn.ftz.f32 %f117, %f370, %f335, %f116; - fma.rn.ftz.f32 %f118, %f368, %f337, %f117; - fma.rn.ftz.f32 %f119, %f367, %f338, %f118; - fma.rn.ftz.f32 %f120, %f374, %f347, %f119; - fma.rn.ftz.f32 %f121, %f373, %f348, %f120; - fma.rn.ftz.f32 %f122, %f372, %f349, %f121; - fma.rn.ftz.f32 %f123, %f371, %f350, %f122; - fma.rn.ftz.f32 %f124, %f378, %f383, %f123; - fma.rn.ftz.f32 %f125, %f377, %f384, %f124; - fma.rn.ftz.f32 %f126, %f376, %f385, %f125; - ld.global.f32 %f127, [%r67+-40960]; - fma.rn.ftz.f32 %f128, %f375, %f127, %f126; - fma.rn.ftz.f32 %f129, %f382, %f323, %f128; - fma.rn.ftz.f32 %f130, %f381, %f324, %f129; - fma.rn.ftz.f32 %f131, %f380, %f325, %f130; - fma.rn.ftz.f32 %f132, %f379, %f326, %f131; - st.global.f32 [%r68+-16384], %f132; - mul.ftz.f32 %f133, %f369, %f337; - fma.rn.ftz.f32 %f134, %f370, %f336, %f133; - fma.rn.ftz.f32 %f135, %f368, %f338, %f134; - fma.rn.ftz.f32 %f136, %f367, %f347, %f135; - fma.rn.ftz.f32 %f137, %f374, %f348, %f136; - fma.rn.ftz.f32 %f138, %f373, %f349, %f137; - fma.rn.ftz.f32 %f139, %f372, %f350, %f138; - fma.rn.ftz.f32 %f140, %f371, %f383, %f139; - fma.rn.ftz.f32 %f141, %f378, %f384, %f140; - fma.rn.ftz.f32 %f142, %f377, %f385, %f141; - ld.global.f32 %f143, [%r67+-36864]; - fma.rn.ftz.f32 %f144, %f376, %f143, %f142; - fma.rn.ftz.f32 %f145, %f375, %f323, %f144; - fma.rn.ftz.f32 %f146, %f382, %f324, %f145; - fma.rn.ftz.f32 %f147, %f381, %f325, %f146; - fma.rn.ftz.f32 %f148, %f380, %f326, %f147; - fma.rn.ftz.f32 %f149, %f379, %f335, %f148; - st.global.f32 [%r68+-12288], %f149; - mul.ftz.f32 %f150, %f369, %f338; - fma.rn.ftz.f32 %f151, %f370, %f337, %f150; - fma.rn.ftz.f32 %f152, %f368, %f347, %f151; - fma.rn.ftz.f32 %f153, %f367, %f348, %f152; - fma.rn.ftz.f32 %f154, %f374, %f349, %f153; - fma.rn.ftz.f32 %f155, %f373, %f350, %f154; - fma.rn.ftz.f32 %f156, %f372, %f383, %f155; - fma.rn.ftz.f32 %f157, %f371, %f384, %f156; - fma.rn.ftz.f32 %f158, %f378, %f385, %f157; - ld.global.f32 %f159, [%r67+-32768]; - fma.rn.ftz.f32 %f160, %f377, %f159, %f158; - fma.rn.ftz.f32 %f161, %f376, %f323, %f160; - fma.rn.ftz.f32 %f162, %f375, %f324, %f161; - fma.rn.ftz.f32 %f163, %f382, %f325, %f162; - fma.rn.ftz.f32 %f164, %f381, %f326, %f163; - fma.rn.ftz.f32 %f165, %f380, %f335, %f164; - fma.rn.ftz.f32 %f166, %f379, %f336, %f165; - st.global.f32 [%r68+-8192], %f166; - mul.ftz.f32 %f167, %f369, %f347; - fma.rn.ftz.f32 %f168, %f370, %f338, %f167; - fma.rn.ftz.f32 %f169, %f368, %f348, %f168; - fma.rn.ftz.f32 %f170, %f367, %f349, %f169; - fma.rn.ftz.f32 %f171, %f374, %f350, %f170; - fma.rn.ftz.f32 %f172, %f373, %f383, %f171; - fma.rn.ftz.f32 %f173, %f372, %f384, %f172; - fma.rn.ftz.f32 %f174, %f371, %f385, %f173; - ld.global.f32 %f175, [%r67+-28672]; - fma.rn.ftz.f32 %f176, %f378, %f175, %f174; - fma.rn.ftz.f32 %f177, %f377, %f323, %f176; - fma.rn.ftz.f32 %f178, %f376, %f324, %f177; - fma.rn.ftz.f32 %f179, %f375, %f325, %f178; - fma.rn.ftz.f32 %f180, %f382, %f326, %f179; - fma.rn.ftz.f32 %f181, %f381, %f335, %f180; - fma.rn.ftz.f32 %f182, %f380, %f336, %f181; - fma.rn.ftz.f32 %f183, %f379, %f337, %f182; - st.global.f32 [%r68+-4096], %f183; - mul.ftz.f32 %f184, %f369, %f348; - fma.rn.ftz.f32 %f185, %f370, %f347, %f184; - fma.rn.ftz.f32 %f186, %f368, %f349, %f185; - fma.rn.ftz.f32 %f187, %f367, %f350, %f186; - fma.rn.ftz.f32 %f188, %f374, %f383, %f187; - fma.rn.ftz.f32 %f189, %f373, %f384, %f188; - fma.rn.ftz.f32 %f190, %f372, %f385, %f189; - ld.global.f32 %f191, [%r67+-24576]; - fma.rn.ftz.f32 %f192, %f371, %f191, %f190; - fma.rn.ftz.f32 %f193, %f378, %f323, %f192; - fma.rn.ftz.f32 %f194, %f377, %f324, %f193; - fma.rn.ftz.f32 %f195, %f376, %f325, %f194; - fma.rn.ftz.f32 %f196, %f375, %f326, %f195; - fma.rn.ftz.f32 %f197, %f382, %f335, %f196; - fma.rn.ftz.f32 %f198, %f381, %f336, %f197; - fma.rn.ftz.f32 %f199, %f380, %f337, %f198; - fma.rn.ftz.f32 %f200, %f379, %f338, %f199; - st.global.f32 [%r68], %f200; - mul.ftz.f32 %f201, %f369, %f349; - fma.rn.ftz.f32 %f202, %f370, %f348, %f201; - fma.rn.ftz.f32 %f203, %f368, %f350, %f202; - fma.rn.ftz.f32 %f204, %f367, %f383, %f203; - fma.rn.ftz.f32 %f205, %f374, %f384, %f204; - fma.rn.ftz.f32 %f206, %f373, %f385, %f205; - ld.global.f32 %f207, [%r67+-20480]; - fma.rn.ftz.f32 %f208, %f372, %f207, %f206; - fma.rn.ftz.f32 %f209, %f371, %f323, %f208; - fma.rn.ftz.f32 %f210, %f378, %f324, %f209; - fma.rn.ftz.f32 %f211, %f377, %f325, %f210; - fma.rn.ftz.f32 %f212, %f376, %f326, %f211; - fma.rn.ftz.f32 %f213, %f375, %f335, %f212; - fma.rn.ftz.f32 %f214, %f382, %f336, %f213; - fma.rn.ftz.f32 %f215, %f381, %f337, %f214; - fma.rn.ftz.f32 %f216, %f380, %f338, %f215; - fma.rn.ftz.f32 %f217, %f379, %f347, %f216; - st.global.f32 [%r68+4096], %f217; - mul.ftz.f32 %f218, %f369, %f350; - fma.rn.ftz.f32 %f219, %f370, %f349, %f218; - fma.rn.ftz.f32 %f220, %f368, %f383, %f219; - fma.rn.ftz.f32 %f221, %f367, %f384, %f220; - fma.rn.ftz.f32 %f222, %f374, %f385, %f221; - ld.global.f32 %f223, [%r67+-16384]; - fma.rn.ftz.f32 %f224, %f373, %f223, %f222; - fma.rn.ftz.f32 %f225, %f372, %f323, %f224; - fma.rn.ftz.f32 %f226, %f371, %f324, %f225; - fma.rn.ftz.f32 %f227, %f378, %f325, %f226; - fma.rn.ftz.f32 %f228, %f377, %f326, %f227; - fma.rn.ftz.f32 %f229, %f376, %f335, %f228; - fma.rn.ftz.f32 %f230, %f375, %f336, %f229; - fma.rn.ftz.f32 %f231, %f382, %f337, %f230; - fma.rn.ftz.f32 %f232, %f381, %f338, %f231; - fma.rn.ftz.f32 %f233, %f380, %f347, %f232; - fma.rn.ftz.f32 %f234, %f379, %f348, %f233; - st.global.f32 [%r68+8192], %f234; - mul.ftz.f32 %f235, %f369, %f383; - fma.rn.ftz.f32 %f236, %f370, %f350, %f235; - fma.rn.ftz.f32 %f237, %f368, %f384, %f236; - fma.rn.ftz.f32 %f238, %f367, %f385, %f237; - ld.global.f32 %f239, [%r67+-12288]; - fma.rn.ftz.f32 %f240, %f374, %f239, %f238; - fma.rn.ftz.f32 %f241, %f373, %f323, %f240; - fma.rn.ftz.f32 %f242, %f372, %f324, %f241; - fma.rn.ftz.f32 %f243, %f371, %f325, %f242; - fma.rn.ftz.f32 %f244, %f378, %f326, %f243; - fma.rn.ftz.f32 %f245, %f377, %f335, %f244; - fma.rn.ftz.f32 %f246, %f376, %f336, %f245; - fma.rn.ftz.f32 %f247, %f375, %f337, %f246; - fma.rn.ftz.f32 %f248, %f382, %f338, %f247; - fma.rn.ftz.f32 %f249, %f381, %f347, %f248; - fma.rn.ftz.f32 %f250, %f380, %f348, %f249; - fma.rn.ftz.f32 %f251, %f379, %f349, %f250; - st.global.f32 [%r68+12288], %f251; - mul.ftz.f32 %f252, %f369, %f384; - fma.rn.ftz.f32 %f253, %f370, %f383, %f252; - fma.rn.ftz.f32 %f254, %f368, %f385, %f253; - ld.global.f32 %f255, [%r67+-8192]; - fma.rn.ftz.f32 %f256, %f367, %f255, %f254; - fma.rn.ftz.f32 %f257, %f374, %f323, %f256; - fma.rn.ftz.f32 %f258, %f373, %f324, %f257; - fma.rn.ftz.f32 %f259, %f372, %f325, %f258; - fma.rn.ftz.f32 %f260, %f371, %f326, %f259; - fma.rn.ftz.f32 %f261, %f378, %f335, %f260; - fma.rn.ftz.f32 %f262, %f377, %f336, %f261; - fma.rn.ftz.f32 %f263, %f376, %f337, %f262; - fma.rn.ftz.f32 %f264, %f375, %f338, %f263; - fma.rn.ftz.f32 %f265, %f382, %f347, %f264; - fma.rn.ftz.f32 %f266, %f381, %f348, %f265; - fma.rn.ftz.f32 %f267, %f380, %f349, %f266; - fma.rn.ftz.f32 %f268, %f379, %f350, %f267; - st.global.f32 [%r68+16384], %f268; - mul.ftz.f32 %f269, %f369, %f385; - fma.rn.ftz.f32 %f270, %f370, %f384, %f269; - ld.global.f32 %f271, [%r67+-4096]; - fma.rn.ftz.f32 %f272, %f368, %f271, %f270; - fma.rn.ftz.f32 %f273, %f367, %f323, %f272; - fma.rn.ftz.f32 %f274, %f374, %f324, %f273; - fma.rn.ftz.f32 %f275, %f373, %f325, %f274; - fma.rn.ftz.f32 %f276, %f372, %f326, %f275; - fma.rn.ftz.f32 %f277, %f371, %f335, %f276; - fma.rn.ftz.f32 %f278, %f378, %f336, %f277; - fma.rn.ftz.f32 %f279, %f377, %f337, %f278; - fma.rn.ftz.f32 %f280, %f376, %f338, %f279; - fma.rn.ftz.f32 %f281, %f375, %f347, %f280; - fma.rn.ftz.f32 %f282, %f382, %f348, %f281; - fma.rn.ftz.f32 %f283, %f381, %f349, %f282; - fma.rn.ftz.f32 %f284, %f380, %f350, %f283; - fma.rn.ftz.f32 %f285, %f379, %f383, %f284; - st.global.f32 [%r68+20480], %f285; - ld.global.f32 %f286, [%r67]; - mul.ftz.f32 %f287, %f369, %f286; - fma.rn.ftz.f32 %f288, %f370, %f385, %f287; - fma.rn.ftz.f32 %f289, %f368, %f323, %f288; - fma.rn.ftz.f32 %f290, %f367, %f324, %f289; - fma.rn.ftz.f32 %f291, %f374, %f325, %f290; - fma.rn.ftz.f32 %f292, %f373, %f326, %f291; - fma.rn.ftz.f32 %f293, %f372, %f335, %f292; - fma.rn.ftz.f32 %f294, %f371, %f336, %f293; - fma.rn.ftz.f32 %f295, %f378, %f337, %f294; - fma.rn.ftz.f32 %f296, %f377, %f338, %f295; - fma.rn.ftz.f32 %f297, %f376, %f347, %f296; - fma.rn.ftz.f32 %f298, %f375, %f348, %f297; - fma.rn.ftz.f32 %f299, %f382, %f349, %f298; - fma.rn.ftz.f32 %f300, %f381, %f350, %f299; - fma.rn.ftz.f32 %f301, %f380, %f383, %f300; - fma.rn.ftz.f32 %f302, %f379, %f384, %f301; - st.global.f32 [%r68+24576], %f302; - mul.ftz.f32 %f303, %f369, %f323; - fma.rn.ftz.f32 %f304, %f370, %f286, %f303; - fma.rn.ftz.f32 %f305, %f368, %f324, %f304; - fma.rn.ftz.f32 %f306, %f367, %f325, %f305; - fma.rn.ftz.f32 %f307, %f374, %f326, %f306; - fma.rn.ftz.f32 %f308, %f373, %f335, %f307; - fma.rn.ftz.f32 %f309, %f372, %f336, %f308; - fma.rn.ftz.f32 %f310, %f371, %f337, %f309; - fma.rn.ftz.f32 %f311, %f378, %f338, %f310; - fma.rn.ftz.f32 %f312, %f377, %f347, %f311; - fma.rn.ftz.f32 %f313, %f376, %f348, %f312; - fma.rn.ftz.f32 %f314, %f375, %f349, %f313; - fma.rn.ftz.f32 %f315, %f382, %f350, %f314; - fma.rn.ftz.f32 %f316, %f381, %f383, %f315; - fma.rn.ftz.f32 %f317, %f380, %f384, %f316; - fma.rn.ftz.f32 %f318, %f379, %f385, %f317; - st.global.f32 [%r68+28672], %f318; - add.s32 %r70, %r70, 65536; - add.s32 %r69, %r69, -1; - setp.ne.s32 %p1, %r69, 0; - mov.f32 %f383, %f383; - mov.f32 %f384, %f384; - mov.f32 %f385, %f385; - mov.f32 %f366, %f286; - @%p1 bra BB0_1; - - ret; -} - - diff --git a/RTCP/GPUProc/src/UHEP/Transpose.cl-0.ptx b/RTCP/GPUProc/src/UHEP/Transpose.cl-0.ptx deleted file mode 100644 index cb8248be325fa81a44e309be3a1a84ebe64e5e75..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/UHEP/Transpose.cl-0.ptx +++ /dev/null @@ -1,142 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Tue Feb 7 07:15:59 2012 (1328595359) -// Driver 295.20 -// - -.version 3.0 -.target sm_21, texmode_independent -.address_size 32 - -.extern .shared .align 16 .b8 shr_1_tmp[4352]; - -.entry UHEP_Transpose( - .param .u32 .ptr .global .align 1 UHEP_Transpose_param_0, - .param .u32 .ptr .global .align 1 UHEP_Transpose_param_1, - .param .u32 .ptr .global .align 4 UHEP_Transpose_param_2 -) -{ - .reg .f32 %f<30>; - .reg .pred %p<7>; - .reg .s32 %r<61>; - - - ld.param.u32 %r25, [UHEP_Transpose_param_0]; - ld.param.u32 %r26, [UHEP_Transpose_param_1]; - ld.param.u32 %r27, [UHEP_Transpose_param_2]; - // inline asm - mov.u32 %r12, %envreg4; - // inline asm - // inline asm - mov.u32 %r13, %ntid.y; - // inline asm - // inline asm - mov.u32 %r14, %ctaid.y; - // inline asm - // inline asm - mov.u32 %r15, %tid.y; - // inline asm - add.s32 %r28, %r15, %r12; - mad.lo.s32 %r29, %r14, %r13, %r28; - // inline asm - mov.u32 %r16, %envreg5; - // inline asm - // inline asm - mov.u32 %r17, %ntid.z; - // inline asm - // inline asm - mov.u32 %r18, %ctaid.z; - // inline asm - // inline asm - mov.u32 %r19, %tid.z; - // inline asm - add.s32 %r30, %r19, %r16; - mad.lo.s32 %r31, %r18, %r17, %r30; - // inline asm - mov.u32 %r20, %tid.x; - // inline asm - shl.b32 %r32, %r29, 4; - and.b32 %r33, %r20, 15; - add.s32 %r34, %r32, %r33; - // inline asm - mov.u32 %r21, %tid.x; - // inline asm - shr.u32 %r35, %r21, 4; - shl.b32 %r36, %r31, 4; - add.s32 %r37, %r36, %r35; - shl.b32 %r38, %r37, 2; - add.s32 %r39, %r27, %r38; - setp.lt.u32 %p4, %r34, 4; - ld.global.u32 %r40, [%r39]; - setp.gt.s32 %p5, %r40, -1; - and.pred %p1, %p4, %p5; - // inline asm - mov.u32 %r22, %tid.x; - // inline asm - shr.u32 %r41, %r22, 4; - // inline asm - mov.u32 %r23, %tid.x; - // inline asm - and.b32 %r42, %r23, 15; - add.s32 %r43, %r36, %r42; - shl.b32 %r44, %r43, 2; - add.s32 %r45, %r27, %r44; - add.s32 %r46, %r41, %r32; - setp.lt.u32 %p2, %r46, 4; - mov.u32 %r47, shr_1_tmp; - mad.lo.s32 %r48, %r33, 272, %r47; - and.b32 %r49, %r21, -16; - add.s32 %r1, %r48, %r49; - ld.global.u32 %r50, [%r45]; - setp.gt.s32 %p3, %r50, -1; - mad.lo.s32 %r51, %r41, 272, %r47; - shl.b32 %r52, %r42, 4; - add.s32 %r2, %r51, %r52; - shl.b32 %r53, %r43, 3; - mad.lo.s32 %r54, %r46, 8511488, %r53; - add.s32 %r55, %r54, %r25; - add.s32 %r60, %r55, 4255744; - shl.b32 %r56, %r34, 4; - mad.lo.s32 %r57, %r40, 66496, %r56; - add.s32 %r59, %r26, %r57; - mov.u32 %r58, 1039; - -BB0_1: - add.s32 %r8, %r60, -4255744; - @!%p1 bra BB0_3; - - ld.global.v4.f32 {%f22, %f23, %f24, %f25}, [%r59]; - st.shared.v4.f32 [%r1], {%f22, %f23, %f24, %f25}; - -BB0_3: - bar.sync 0; - @!%p2 bra BB0_8; - - @%p3 bra BB0_6; - - mov.f32 %f1, 0f00000000; - mov.f32 %f26, %f1; - mov.f32 %f27, %f1; - mov.f32 %f28, %f1; - mov.f32 %f29, %f1; - bra.uni BB0_7; - -BB0_6: - ld.shared.v4.f32 {%f26, %f27, %f28, %f29}, [%r2]; - -BB0_7: - st.global.v2.f32 [%r8], {%f26, %f27}; - st.global.v2.f32 [%r8+4255744], {%f28, %f29}; - -BB0_8: - bar.sync 0; - add.s32 %r60, %r60, 4096; - add.s32 %r59, %r59, 64; - add.s32 %r58, %r58, -1; - setp.ne.s32 %p6, %r58, 0; - @%p6 bra BB0_1; - - ret; -} - - diff --git a/RTCP/GPUProc/src/UHEP/Trigger.cl-0.ptx b/RTCP/GPUProc/src/UHEP/Trigger.cl-0.ptx deleted file mode 100644 index ae2713f9037df945f18a988f001c6c0d0abbf29b..0000000000000000000000000000000000000000 --- a/RTCP/GPUProc/src/UHEP/Trigger.cl-0.ptx +++ /dev/null @@ -1,511 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Tue Feb 7 07:15:59 2012 (1328595359) -// Driver 295.20 -// - -.version 3.0 -.target sm_21, texmode_independent -.address_size 32 - -.extern .shared .align 64 .b8 shr_4_tmp[16384]; - -.entry trigger( - .param .u32 .ptr .global .align 1 trigger_param_0, - .param .u32 .ptr .global .align 4 trigger_param_1 -) -{ - .reg .f32 %f<434>; - .reg .pred %p<21>; - .reg .s32 %r<71>; - - - ld.param.u32 %r31, [trigger_param_1]; - // inline asm - mov.u32 %r23, %tid.x; - // inline asm - // inline asm - mov.u32 %r24, %tid.y; - // inline asm - shl.b32 %r32, %r24, 4; - // inline asm - mov.u32 %r25, %envreg5; - // inline asm - // inline asm - mov.u32 %r26, %ntid.z; - // inline asm - // inline asm - mov.u32 %r27, %ctaid.z; - // inline asm - // inline asm - mov.u32 %r28, %tid.z; - // inline asm - add.s32 %r33, %r28, %r25; - mad.lo.s32 %r3, %r27, %r26, %r33; - shl.b32 %r34, %r24, 10; - mov.u32 %r35, shr_4_tmp; - add.s32 %r36, %r35, %r34; - shl.b32 %r37, %r23, 6; - add.s32 %r4, %r36, %r37; - add.s32 %r2, %r32, %r23; - shl.b32 %r38, %r2, 12; - and.b32 %r5, %r38, 16773120; - shl.b32 %r39, %r3, 21; - add.s32 %r40, %r23, %r39; - shl.b32 %r41, %r24, 12; - add.s32 %r42, %r40, %r41; - shl.b32 %r43, %r42, 2; - add.s32 %r70, %r31, %r43; - shl.b32 %r44, %r24, 6; - shl.b32 %r45, %r23, 2; - add.s32 %r46, %r35, %r44; - add.s32 %r47, %r46, %r45; - add.s32 %r7, %r47, 3072; - mov.f32 %f420, 0f00000000; - mov.f32 %f430, %f420; - mov.f32 %f431, %f420; - mov.f32 %f432, %f420; - mov.f32 %f433, %f420; - mov.f32 %f422, %f405; - mov.f32 %f419, %f420; - mov.f32 %f421, %f420; - mov.f32 %f418, %f420; - mov.u32 %r65, 0; - mov.u32 %r64, %r65; - mov.f32 %f426, %f420; - mov.f32 %f427, %f420; - mov.f32 %f428, %f420; - mov.f32 %f429, %f420; - mov.f32 %f423, %f420; - mov.f32 %f424, %f420; - mov.f32 %f425, %f420; - -BB0_1: - mov.u32 %r68, %r70; - mov.u32 %r8, %r68; - shl.b32 %r49, %r64, 4; - add.s32 %r12, %r5, %r49; - mov.u32 %r67, 16; - mov.u32 %r66, %r7; - mov.u32 %r69, %r8; - -BB0_2: - mov.u32 %r15, %r69; - ld.global.f32 %f40, [%r15]; - ld.global.f32 %f41, [%r15+4194304]; - mul.ftz.f32 %f42, %f41, %f41; - fma.rn.ftz.f32 %f43, %f40, %f40, %f42; - st.shared.f32 [%r66+-3072], %f43; - sub.ftz.f32 %f44, %f43, %f419; - add.ftz.f32 %f45, %f421, 0f3F800000; - div.approx.ftz.f32 %f46, %f44, %f45; - add.ftz.f32 %f47, %f419, %f46; - sub.ftz.f32 %f48, %f43, %f47; - fma.rn.ftz.f32 %f49, %f44, %f48, %f420; - ld.global.f32 %f50, [%r15+262144]; - ld.global.f32 %f51, [%r15+4456448]; - mul.ftz.f32 %f52, %f51, %f51; - fma.rn.ftz.f32 %f53, %f50, %f50, %f52; - st.shared.f32 [%r66+-2048], %f53; - add.ftz.f32 %f54, %f45, 0f3F800000; - sub.ftz.f32 %f55, %f53, %f47; - div.approx.ftz.f32 %f56, %f55, %f54; - add.ftz.f32 %f57, %f47, %f56; - sub.ftz.f32 %f58, %f53, %f57; - fma.rn.ftz.f32 %f59, %f55, %f58, %f49; - ld.global.f32 %f60, [%r15+524288]; - ld.global.f32 %f61, [%r15+4718592]; - mul.ftz.f32 %f62, %f61, %f61; - fma.rn.ftz.f32 %f63, %f60, %f60, %f62; - st.shared.f32 [%r66+-1024], %f63; - add.ftz.f32 %f64, %f54, 0f3F800000; - sub.ftz.f32 %f65, %f63, %f57; - div.approx.ftz.f32 %f66, %f65, %f64; - add.ftz.f32 %f67, %f57, %f66; - sub.ftz.f32 %f68, %f63, %f67; - fma.rn.ftz.f32 %f69, %f65, %f68, %f59; - ld.global.f32 %f70, [%r15+786432]; - ld.global.f32 %f71, [%r15+4980736]; - mul.ftz.f32 %f72, %f71, %f71; - fma.rn.ftz.f32 %f73, %f70, %f70, %f72; - st.shared.f32 [%r66], %f73; - add.ftz.f32 %f421, %f64, 0f3F800000; - sub.ftz.f32 %f74, %f73, %f67; - div.approx.ftz.f32 %f75, %f74, %f421; - add.ftz.f32 %f419, %f67, %f75; - sub.ftz.f32 %f76, %f73, %f419; - fma.rn.ftz.f32 %f420, %f74, %f76, %f69; - add.s32 %r16, %r15, 1048576; - add.s32 %r66, %r66, 4096; - add.s32 %r67, %r67, -4; - setp.ne.s32 %p1, %r67, 0; - mov.u32 %r69, %r16; - @%p1 bra BB0_2; - - bar.sync 0; - ld.shared.v4.f32 {%f258, %f259, %f260, %f261}, [%r4]; - add.ftz.f32 %f79, %f422, %f258; - sub.ftz.f32 %f81, %f79, %f423; - add.ftz.f32 %f83, %f81, %f259; - sub.ftz.f32 %f85, %f83, %f424; - add.ftz.f32 %f87, %f85, %f260; - sub.ftz.f32 %f89, %f87, %f425; - add.ftz.f32 %f91, %f89, %f261; - sub.ftz.f32 %f93, %f91, %f426; - ld.shared.v4.f32 {%f274, %f423, %f424, %f425}, [%r4+16]; - add.ftz.f32 %f95, %f93, %f274; - sub.ftz.f32 %f97, %f95, %f427; - add.ftz.f32 %f99, %f97, %f423; - sub.ftz.f32 %f101, %f99, %f428; - add.ftz.f32 %f103, %f101, %f424; - sub.ftz.f32 %f105, %f103, %f429; - add.ftz.f32 %f107, %f105, %f425; - sub.ftz.f32 %f109, %f107, %f430; - ld.shared.v4.f32 {%f426, %f427, %f428, %f429}, [%r4+32]; - add.ftz.f32 %f111, %f109, %f426; - sub.ftz.f32 %f113, %f111, %f431; - add.ftz.f32 %f115, %f113, %f427; - sub.ftz.f32 %f117, %f115, %f432; - add.ftz.f32 %f119, %f117, %f428; - sub.ftz.f32 %f121, %f119, %f433; - add.ftz.f32 %f123, %f121, %f429; - sub.ftz.f32 %f124, %f123, %f258; - ld.shared.v4.f32 {%f430, %f431, %f432, %f433}, [%r4+48]; - add.ftz.f32 %f126, %f124, %f430; - sub.ftz.f32 %f127, %f126, %f259; - add.ftz.f32 %f129, %f127, %f431; - sub.ftz.f32 %f130, %f129, %f260; - add.ftz.f32 %f132, %f130, %f432; - sub.ftz.f32 %f133, %f132, %f261; - add.ftz.f32 %f135, %f133, %f433; - sub.ftz.f32 %f136, %f135, %f274; - max.f32 %f137, %f81, %f85; - max.f32 %f138, %f89, %f93; - max.f32 %f139, %f137, %f138; - max.f32 %f140, %f97, %f101; - max.f32 %f141, %f105, %f109; - max.f32 %f142, %f140, %f141; - max.f32 %f143, %f113, %f117; - max.f32 %f144, %f121, %f124; - max.f32 %f145, %f143, %f144; - max.f32 %f146, %f127, %f130; - max.f32 %f147, %f133, %f136; - max.f32 %f148, %f146, %f147; - max.f32 %f149, %f139, %f142; - max.f32 %f150, %f145, %f148; - max.f32 %f151, %f149, %f150; - setp.ltu.ftz.f32 %p2, %f151, %f418; - selp.f32 %f418, %f418, %f151, %p2; - selp.b32 %r65, %r65, %r12, %p2; - mov.f32 %f306, %f81; - mov.f32 %f307, %f85; - mov.f32 %f308, %f89; - mov.f32 %f309, %f93; - mov.f32 %f326, %f97; - mov.f32 %f327, %f101; - mov.f32 %f328, %f105; - mov.f32 %f329, %f109; - mov.f32 %f346, %f113; - mov.f32 %f347, %f117; - mov.f32 %f348, %f121; - mov.f32 %f349, %f124; - mov.f32 %f362, %f127; - mov.f32 %f363, %f130; - mov.f32 %f364, %f133; - mov.f32 %f422, %f136; - bar.sync 0; - add.s32 %r64, %r64, 1; - add.s32 %r21, %r8, 64; - setp.ne.s32 %p3, %r64, 256; - mov.u32 %r70, %r21; - @%p3 bra BB0_1; - - shl.b32 %r50, %r2, 2; - add.s32 %r22, %r35, %r50; - st.shared.f32 [%r22], %f419; - st.shared.f32 [%r22+1024], %f420; - st.shared.f32 [%r22+2048], %f418; - st.shared.u32 [%r22+3072], %r65; - setp.lt.u32 %p4, %r2, 128; - @%p4 bra BB0_5; - bra.uni BB0_7; - -BB0_5: - ld.shared.f32 %f152, [%r22+1536]; - ld.shared.f32 %f153, [%r22+512]; - sub.ftz.f32 %f154, %f153, %f419; - add.ftz.f32 %f155, %f419, %f153; - div.rn.ftz.f32 %f156, %f155, 0f40000000; - st.shared.f32 [%r22], %f156; - add.ftz.f32 %f157, %f420, %f152; - mul.ftz.f32 %f158, %f154, %f154; - mul.ftz.f32 %f159, %f158, %f421; - div.rn.ftz.f32 %f160, %f159, 0f40000000; - add.ftz.f32 %f161, %f157, %f160; - st.shared.f32 [%r22+1024], %f161; - add.ftz.f32 %f421, %f421, %f421; - ld.shared.f32 %f13, [%r22+2560]; - setp.lt.ftz.f32 %p5, %f418, %f13; - @%p5 bra BB0_6; - bra.uni BB0_7; - -BB0_6: - st.shared.f32 [%r22+2048], %f13; - ld.shared.u32 %r52, [%r22+3584]; - st.shared.u32 [%r22+3072], %r52; - -BB0_7: - bar.sync 0; - setp.lt.u32 %p6, %r2, 64; - @%p6 bra BB0_8; - bra.uni BB0_10; - -BB0_8: - ld.shared.f32 %f162, [%r22]; - ld.shared.f32 %f163, [%r22+1024]; - ld.shared.f32 %f164, [%r22+1280]; - ld.shared.f32 %f165, [%r22+256]; - sub.ftz.f32 %f166, %f165, %f162; - add.ftz.f32 %f167, %f162, %f165; - div.rn.ftz.f32 %f168, %f167, 0f40000000; - st.shared.f32 [%r22], %f168; - add.ftz.f32 %f169, %f163, %f164; - mul.ftz.f32 %f170, %f166, %f166; - mul.ftz.f32 %f171, %f170, %f421; - div.rn.ftz.f32 %f172, %f171, 0f40000000; - add.ftz.f32 %f173, %f169, %f172; - st.shared.f32 [%r22+1024], %f173; - add.ftz.f32 %f421, %f421, %f421; - ld.shared.f32 %f16, [%r22+2304]; - ld.shared.f32 %f174, [%r22+2048]; - setp.lt.ftz.f32 %p7, %f174, %f16; - @%p7 bra BB0_9; - bra.uni BB0_10; - -BB0_9: - st.shared.f32 [%r22+2048], %f16; - ld.shared.u32 %r53, [%r22+3328]; - st.shared.u32 [%r22+3072], %r53; - -BB0_10: - bar.sync 0; - setp.lt.u32 %p8, %r2, 32; - @%p8 bra BB0_11; - bra.uni BB0_13; - -BB0_11: - ld.shared.f32 %f175, [%r22]; - ld.shared.f32 %f176, [%r22+1024]; - ld.shared.f32 %f177, [%r22+1152]; - ld.shared.f32 %f178, [%r22+128]; - sub.ftz.f32 %f179, %f178, %f175; - add.ftz.f32 %f180, %f175, %f178; - div.rn.ftz.f32 %f181, %f180, 0f40000000; - st.shared.f32 [%r22], %f181; - add.ftz.f32 %f182, %f176, %f177; - mul.ftz.f32 %f183, %f179, %f179; - mul.ftz.f32 %f184, %f183, %f421; - div.rn.ftz.f32 %f185, %f184, 0f40000000; - add.ftz.f32 %f186, %f182, %f185; - st.shared.f32 [%r22+1024], %f186; - add.ftz.f32 %f421, %f421, %f421; - ld.shared.f32 %f19, [%r22+2176]; - ld.shared.f32 %f187, [%r22+2048]; - setp.lt.ftz.f32 %p9, %f187, %f19; - @%p9 bra BB0_12; - bra.uni BB0_13; - -BB0_12: - st.shared.f32 [%r22+2048], %f19; - ld.shared.u32 %r54, [%r22+3200]; - st.shared.u32 [%r22+3072], %r54; - -BB0_13: - bar.sync 0; - setp.lt.u32 %p10, %r2, 16; - @%p10 bra BB0_14; - bra.uni BB0_16; - -BB0_14: - ld.shared.f32 %f188, [%r22]; - ld.shared.f32 %f189, [%r22+1024]; - ld.shared.f32 %f190, [%r22+1088]; - ld.shared.f32 %f191, [%r22+64]; - sub.ftz.f32 %f192, %f191, %f188; - add.ftz.f32 %f193, %f188, %f191; - div.rn.ftz.f32 %f194, %f193, 0f40000000; - st.shared.f32 [%r22], %f194; - add.ftz.f32 %f195, %f189, %f190; - mul.ftz.f32 %f196, %f192, %f192; - mul.ftz.f32 %f197, %f196, %f421; - div.rn.ftz.f32 %f198, %f197, 0f40000000; - add.ftz.f32 %f199, %f195, %f198; - st.shared.f32 [%r22+1024], %f199; - add.ftz.f32 %f421, %f421, %f421; - ld.shared.f32 %f22, [%r22+2112]; - ld.shared.f32 %f200, [%r22+2048]; - setp.lt.ftz.f32 %p11, %f200, %f22; - @%p11 bra BB0_15; - bra.uni BB0_16; - -BB0_15: - st.shared.f32 [%r22+2048], %f22; - ld.shared.u32 %r55, [%r22+3136]; - st.shared.u32 [%r22+3072], %r55; - -BB0_16: - bar.sync 0; - setp.lt.u32 %p12, %r2, 8; - @%p12 bra BB0_17; - bra.uni BB0_19; - -BB0_17: - ld.shared.f32 %f201, [%r22]; - ld.shared.f32 %f202, [%r22+1024]; - ld.shared.f32 %f203, [%r22+1056]; - ld.shared.f32 %f204, [%r22+32]; - sub.ftz.f32 %f205, %f204, %f201; - add.ftz.f32 %f206, %f201, %f204; - div.rn.ftz.f32 %f207, %f206, 0f40000000; - st.shared.f32 [%r22], %f207; - add.ftz.f32 %f208, %f202, %f203; - mul.ftz.f32 %f209, %f205, %f205; - mul.ftz.f32 %f210, %f209, %f421; - div.rn.ftz.f32 %f211, %f210, 0f40000000; - add.ftz.f32 %f212, %f208, %f211; - st.shared.f32 [%r22+1024], %f212; - add.ftz.f32 %f421, %f421, %f421; - ld.shared.f32 %f25, [%r22+2080]; - ld.shared.f32 %f213, [%r22+2048]; - setp.lt.ftz.f32 %p13, %f213, %f25; - @%p13 bra BB0_18; - bra.uni BB0_19; - -BB0_18: - st.shared.f32 [%r22+2048], %f25; - ld.shared.u32 %r56, [%r22+3104]; - st.shared.u32 [%r22+3072], %r56; - -BB0_19: - bar.sync 0; - setp.lt.u32 %p14, %r2, 4; - @%p14 bra BB0_20; - bra.uni BB0_22; - -BB0_20: - ld.shared.f32 %f214, [%r22]; - ld.shared.f32 %f215, [%r22+1024]; - ld.shared.f32 %f216, [%r22+1040]; - ld.shared.f32 %f217, [%r22+16]; - sub.ftz.f32 %f218, %f217, %f214; - add.ftz.f32 %f219, %f214, %f217; - div.rn.ftz.f32 %f220, %f219, 0f40000000; - st.shared.f32 [%r22], %f220; - add.ftz.f32 %f221, %f215, %f216; - mul.ftz.f32 %f222, %f218, %f218; - mul.ftz.f32 %f223, %f222, %f421; - div.rn.ftz.f32 %f224, %f223, 0f40000000; - add.ftz.f32 %f225, %f221, %f224; - st.shared.f32 [%r22+1024], %f225; - add.ftz.f32 %f421, %f421, %f421; - ld.shared.f32 %f28, [%r22+2064]; - ld.shared.f32 %f226, [%r22+2048]; - setp.lt.ftz.f32 %p15, %f226, %f28; - @%p15 bra BB0_21; - bra.uni BB0_22; - -BB0_21: - st.shared.f32 [%r22+2048], %f28; - ld.shared.u32 %r57, [%r22+3088]; - st.shared.u32 [%r22+3072], %r57; - -BB0_22: - bar.sync 0; - setp.lt.u32 %p16, %r2, 2; - @%p16 bra BB0_23; - bra.uni BB0_25; - -BB0_23: - ld.shared.f32 %f227, [%r22]; - ld.shared.f32 %f228, [%r22+1024]; - ld.shared.f32 %f229, [%r22+1032]; - ld.shared.f32 %f230, [%r22+8]; - sub.ftz.f32 %f231, %f230, %f227; - add.ftz.f32 %f232, %f227, %f230; - div.rn.ftz.f32 %f233, %f232, 0f40000000; - st.shared.f32 [%r22], %f233; - add.ftz.f32 %f234, %f228, %f229; - mul.ftz.f32 %f235, %f231, %f231; - mul.ftz.f32 %f236, %f235, %f421; - div.rn.ftz.f32 %f237, %f236, 0f40000000; - add.ftz.f32 %f238, %f234, %f237; - st.shared.f32 [%r22+1024], %f238; - add.ftz.f32 %f421, %f421, %f421; - ld.shared.f32 %f31, [%r22+2056]; - ld.shared.f32 %f239, [%r22+2048]; - setp.lt.ftz.f32 %p17, %f239, %f31; - @%p17 bra BB0_24; - bra.uni BB0_25; - -BB0_24: - st.shared.f32 [%r22+2048], %f31; - ld.shared.u32 %r58, [%r22+3080]; - st.shared.u32 [%r22+3072], %r58; - -BB0_25: - bar.sync 0; - setp.eq.s32 %p18, %r2, 0; - @%p18 bra BB0_26; - bra.uni BB0_28; - -BB0_26: - ld.shared.f32 %f240, [%r22]; - ld.shared.f32 %f241, [%r22+1024]; - ld.shared.f32 %f242, [%r22+1028]; - ld.shared.f32 %f243, [%r22+4]; - sub.ftz.f32 %f244, %f243, %f240; - add.ftz.f32 %f245, %f240, %f243; - div.rn.ftz.f32 %f246, %f245, 0f40000000; - st.shared.f32 [%r22], %f246; - add.ftz.f32 %f247, %f241, %f242; - mul.ftz.f32 %f248, %f244, %f244; - mul.ftz.f32 %f249, %f248, %f421; - div.rn.ftz.f32 %f250, %f249, 0f40000000; - add.ftz.f32 %f251, %f247, %f250; - st.shared.f32 [%r22+1024], %f251; - add.ftz.f32 %f421, %f421, %f421; - ld.shared.f32 %f34, [%r22+2052]; - ld.shared.f32 %f252, [%r22+2048]; - setp.lt.ftz.f32 %p19, %f252, %f34; - @%p19 bra BB0_27; - bra.uni BB0_28; - -BB0_27: - st.shared.f32 [%r22+2048], %f34; - ld.shared.u32 %r59, [%r22+3076]; - st.shared.u32 [%r22+3072], %r59; - -BB0_28: - bar.sync 0; - @%p18 bra BB0_30; - - ret; - -BB0_30: - shl.b32 %r60, %r3, 4; - ld.param.u32 %r63, [trigger_param_0]; - add.s32 %r61, %r63, %r60; - ld.shared.f32 %f253, [shr_4_tmp]; - st.global.f32 [%r61], %f253; - add.ftz.f32 %f254, %f421, 0fBF800000; - ld.shared.f32 %f255, [shr_4_tmp+1024]; - div.approx.ftz.f32 %f256, %f255, %f254; - st.global.f32 [%r61+4], %f256; - ld.shared.f32 %f257, [shr_4_tmp+2048]; - st.global.f32 [%r61+8], %f257; - ld.shared.u32 %r62, [shr_4_tmp+3072]; - st.global.u32 [%r61+12], %r62; - ret; -} - - diff --git a/RTCP/GPUProc/src/octave-core b/RTCP/GPUProc/src/octave-core deleted file mode 100644 index 608aba01c896492a0c4befa1f98dc1416cc6cf6c..0000000000000000000000000000000000000000 Binary files a/RTCP/GPUProc/src/octave-core and /dev/null differ