diff --git a/.gitattributes b/.gitattributes index cb6389607cec7ec700eda30eab1cf3c5be602ad6..95194ed9a9e03fcec68412e7c8a99953e592302e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3594,6 +3594,7 @@ RTCP/IONProc/test/tDelayCompensation.sh -text RTCP/IONProc/test/tSSH.cc -text RTCP/IONProc/test/tSSH.sh -text RTCP/IONProc/test/tSSH.stdout -text +RTCP/Interface/include/Interface/BGPAsm.h -text RTCP/Interface/include/Interface/BeamCoordinates.h -text RTCP/Interface/include/Interface/BeamFormedData.h -text RTCP/Interface/include/Interface/DataFactory.h -text @@ -3606,9 +3607,11 @@ RTCP/Interface/include/Interface/Stream.h -text RTCP/Interface/include/Interface/StreamableData.h -text RTCP/Interface/include/Interface/TransposedBeamFormedData.h -text RTCP/Interface/include/Interface/TriggerData.h -text +RTCP/Interface/src/BGPAsm.S -text RTCP/Interface/src/BeamCoordinates.cc -text RTCP/Interface/src/DataFactory.cc -text RTCP/Interface/src/Stream.cc -text +RTCP/Interface/test/tCorrelatedData.cc -text RTCP/LofarStMan/CMakeLists.txt -text RTCP/LofarStMan/include/LofarStMan/CMakeLists.txt -text RTCP/LofarStMan/src/CMakeLists.txt -text diff --git a/RTCP/CNProc/src/BeamFormerAsm.S b/RTCP/CNProc/src/BeamFormerAsm.S index 9b4eadf8ecb79f34f7af25a8eed045a44506b9a6..680e37ebf8b0322aad7b79aa09df167f1b1236b3 100644 --- a/RTCP/CNProc/src/BeamFormerAsm.S +++ b/RTCP/CNProc/src/BeamFormerAsm.S @@ -1,765 +1,5 @@ #if defined HAVE_BGP -.global _add_2_single_precision_vectors -_add_2_single_precision_vectors: - - li 0,-16 - stfpdux 14,1,0 - stfpdux 15,1,0 - - srwi 6,6,4 - li 11,8 - subi 6,6,1 - mtctr 6 - - cmpwi 0,6,0 - lfpsx 0,0,4 - lfpsx 1,0,5 - lfpsux 2,4,11 - lfpsux 3,5,11 - lfpsux 4,4,11 - lfpsux 5,5,11 - lfpsux 6,4,11 - lfpsux 7,5,11 - lfpsux 8,4,11 - lfpsux 9,5,11 - lfpsux 10,4,11 - lfpsux 11,5,11 - fpadd 0,0,1 - lfpsux 12,4,11 - lfpsux 13,5,11 - fpadd 2,2,3 - lfpsux 14,4,11 - lfpsux 15,5,11 - stfpsx 0,0,3 - fpadd 4,4,5 - beq- 0,1f - -0: lfpsux 0,4,11 - lfpsux 1,5,11 - stfpsux 
2,3,11 - fpadd 6,6,7 - lfpsux 2,4,11 - lfpsux 3,5,11 - stfpsux 4,3,11 - fpadd 8,8,9 - lfpsux 4,4,11 - lfpsux 5,5,11 - stfpsux 6,3,11 - fpadd 10,10,11 - lfpsux 6,4,11 - lfpsux 7,5,11 - stfpsux 8,3,11 - fpadd 12,12,13 - lfpsux 8,4,11 - lfpsux 9,5,11 - stfpsux 10,3,11 - fpadd 14,14,15 - lfpsux 10,4,11 - lfpsux 11,5,11 - stfpsux 12,3,11 - fpadd 0,0,1 - lfpsux 12,4,11 - lfpsux 13,5,11 - stfpsux 14,3,11 - fpadd 2,2,3 - lfpsux 14,4,11 - lfpsux 15,5,11 - stfpsux 0,3,11 - fpadd 4,4,5 - bdnz 0b - -1: stfpsux 2,3,11 - fpadd 6,6,7 - stfpsux 4,3,11 - fpadd 8,8,9 - stfpsux 6,3,11 - fpadd 10,10,11 - stfpsux 8,3,11 - fpadd 12,12,13 - stfpsux 10,3,11 - fpadd 14,14,15 - stfpsux 12,3,11 - stfpsux 14,3,11 - - li 0,16 # restore call-saved registers - lfpdx 15,0,1 - lfpdux 14,1,0 - addi 1,1,16 - - blr - - -.global _add_3_single_precision_vectors -_add_3_single_precision_vectors: - - li 0,-16 - stfpdux 14,1,0 - stfpdux 15,1,0 - stfpdux 16,1,0 - stfpdux 17,1,0 - stfpdux 18,1,0 - stfpdux 19,1,0 - stfpdux 20,1,0 - stfpdux 21,1,0 - stfpdux 22,1,0 - stfpdux 23,1,0 - - srwi 7,7,4 - li 11,8 - subi 7,7,1 - mtctr 7 - - cmpwi 0,7,0 - lfpsx 0,0,4 - lfpsx 1,0,5 - lfpsx 2,0,6 - lfpsux 3,4,11 - lfpsux 4,5,11 - lfpsux 5,6,11 - lfpsux 6,4,11 - lfpsux 7,5,11 - lfpsux 8,6,11 - lfpsux 9,4,11 - lfpsux 10,5,11 - lfpsux 11,6,11 - lfpsux 12,4,11 - lfpsux 13,5,11 - lfpsux 14,6,11 - lfpsux 15,4,11 - lfpsux 16,5,11 - fpadd 0,0,1 - lfpsux 17,6,11 - lfpsux 18,4,11 - lfpsux 19,5,11 - fpadd 3,3,4 - lfpsux 20,6,11 - fpadd 0,0,2 - lfpsux 21,4,11 - lfpsux 22,5,11 - fpadd 6,6,7 - lfpsux 23,6,11 - fpadd 3,3,5 - stfpsx 0,0,3 - beq- 0,1f - li 10,32 - -0: lfpsux 0,4,11 - lfpsux 1,5,11 - dcbt 4,10 - fpadd 9,9,10 - lfpsux 2,6,11 - fpadd 6,6,8 - stfpsux 3,3,11 - lfpsux 3,4,11 - lfpsux 4,5,11 - fpadd 12,12,13 - lfpsux 5,6,11 - fpadd 9,9,11 - stfpsux 6,3,11 - lfpsux 6,4,11 - lfpsux 7,5,11 - fpadd 15,15,16 - lfpsux 8,6,11 - fpadd 12,12,14 - stfpsux 9,3,11 - lfpsux 9,4,11 - lfpsux 10,5,11 - fpadd 18,18,19 - lfpsux 11,6,11 - fpadd 
15,15,17 - stfpsux 12,3,11 - lfpsux 12,4,11 - lfpsux 13,5,11 - fpadd 21,21,22 - lfpsux 14,6,11 - fpadd 18,18,20 - stfpsux 15,3,11 - lfpsux 15,4,11 - lfpsux 16,5,11 - fpadd 0,0,1 - lfpsux 17,6,11 - fpadd 21,21,23 - stfpsux 18,3,11 - lfpsux 18,4,11 - lfpsux 19,5,11 - fpadd 3,3,4 - lfpsux 20,6,11 - fpadd 0,0,2 - stfpsux 21,3,11 - lfpsux 21,4,11 - lfpsux 22,5,11 - fpadd 6,6,7 - lfpsux 23,6,11 - fpadd 3,3,5 - stfpsux 0,3,11 - bdnz 0b - -1: fpadd 9,9,10 - fpadd 6,6,8 - stfpsux 3,3,11 - fpadd 12,12,13 - fpadd 9,9,11 - stfpsux 6,3,11 - fpadd 15,15,16 - fpadd 12,12,14 - stfpsux 9,3,11 - fpadd 18,18,19 - fpadd 15,15,17 - stfpsux 12,3,11 - fpadd 21,21,22 - fpadd 18,18,20 - stfpsux 15,3,11 - fpadd 21,21,23 - stfpsux 18,3,11 - stfpsux 21,3,11 - - li 0,16 # restore call-saved registers - lfpdx 23,0,1 - lfpdux 22,1,0 - lfpdux 21,1,0 - lfpdux 20,1,0 - lfpdux 19,1,0 - lfpdux 18,1,0 - lfpdux 17,1,0 - lfpdux 16,1,0 - lfpdux 15,1,0 - lfpdux 14,1,0 - addi 1,1,16 - - blr - - -.global _add_4_single_precision_vectors -_add_4_single_precision_vectors: - - li 0,-16 - stfpdux 14,1,0 - stfpdux 15,1,0 - stfpdux 16,1,0 - stfpdux 17,1,0 - stfpdux 18,1,0 - stfpdux 19,1,0 - stfpdux 20,1,0 - stfpdux 21,1,0 - stfpdux 22,1,0 - stfpdux 23,1,0 - stfpdux 24,1,0 - stfpdux 25,1,0 - stfpdux 26,1,0 - stfpdux 27,1,0 - stfpdux 28,1,0 - stfpdux 29,1,0 - stfpdux 30,1,0 - stfpdux 31,1,0 - - srwi 8,8,4 - li 11,8 - subi 8,8,1 - mtctr 8 - - cmpwi 0,8,0 - lfpsx 0,0,4 - lfpsx 1,0,5 - lfpsx 2,0,6 - lfpsx 3,0,7 - lfpsux 4,4,11 - lfpsux 5,5,11 - lfpsux 6,6,11 - lfpsux 7,7,11 - lfpsux 8,4,11 - lfpsux 9,5,11 - lfpsux 10,6,11 - lfpsux 11,7,11 - lfpsux 12,4,11 - lfpsux 13,5,11 - lfpsux 14,6,11 - lfpsux 15,7,11 - lfpsux 16,4,11 - lfpsux 17,5,11 - lfpsux 18,6,11 - lfpsux 19,7,11 - lfpsux 20,4,11 - lfpsux 21,5,11 - fpadd 0,0,1 - fpadd 2,2,3 - lfpsux 22,6,11 - lfpsux 23,7,11 - lfpsux 24,4,11 - lfpsux 25,5,11 - fpadd 4,4,5 - fpadd 6,6,7 - lfpsux 26,6,11 - lfpsux 27,7,11 - fpadd 0,0,2 - lfpsux 28,4,11 - lfpsux 29,5,11 - fpadd 
8,8,9 - fpadd 10,10,11 - lfpsux 30,6,11 - lfpsux 31,7,11 - - stfpsx 0,0,3 - fpadd 4,4,6 - beq- 0,1f - -0: lfpsux 0,4,11 - lfpsux 1,5,11 - fpadd 12,12,13 - fpadd 14,14,15 - lfpsux 2,6,11 - lfpsux 3,7,11 - - stfpsux 4,3,11 - fpadd 8,8,10 - - lfpsux 4,4,11 - lfpsux 5,5,11 - fpadd 16,16,17 - fpadd 18,18,19 - lfpsux 6,6,11 - lfpsux 7,7,11 - - stfpsux 8,3,11 - fpadd 12,12,14 - - lfpsux 8,4,11 - lfpsux 9,5,11 - fpadd 20,20,21 - fpadd 22,22,23 - lfpsux 10,6,11 - lfpsux 11,7,11 - - stfpsux 12,3,11 - fpadd 16,16,18 - - lfpsux 12,4,11 - lfpsux 13,5,11 - fpadd 24,24,25 - fpadd 26,26,27 - lfpsux 14,6,11 - lfpsux 15,7,11 - - stfpsux 16,3,11 - fpadd 20,20,22 - - lfpsux 16,4,11 - lfpsux 17,5,11 - fpadd 28,28,29 - fpadd 30,30,31 - lfpsux 18,6,11 - lfpsux 19,7,11 - - stfpsux 20,3,11 - fpadd 24,24,26 - - lfpsux 20,4,11 - lfpsux 21,5,11 - fpadd 0,0,1 - fpadd 2,2,3 - lfpsux 22,6,11 - lfpsux 23,7,11 - - stfpsux 24,3,11 - fpadd 28,28,30 - - lfpsux 24,4,11 - lfpsux 25,5,11 - fpadd 4,4,5 - fpadd 6,6,7 - lfpsux 26,6,11 - lfpsux 27,7,11 - - stfpsux 28,3,11 - fpadd 0,0,2 - - lfpsux 28,4,11 - lfpsux 29,5,11 - fpadd 8,8,9 - fpadd 10,10,11 - lfpsux 30,6,11 - lfpsux 31,7,11 - - stfpsux 0,3,11 - fpadd 4,4,6 - - bdnz 0b - -1: fpadd 12,12,13 - fpadd 14,14,15 - - stfpsux 4,3,11 - fpadd 8,8,10 - - fpadd 16,16,17 - fpadd 18,18,19 - - stfpsux 8,3,11 - fpadd 12,12,14 - - fpadd 20,20,21 - fpadd 22,22,23 - - stfpsux 12,3,11 - fpadd 16,16,18 - - fpadd 24,24,25 - fpadd 26,26,27 - - stfpsux 16,3,11 - fpadd 20,20,22 - - lfpsux 16,4,11 - lfpsux 17,5,11 - fpadd 28,28,29 - fpadd 30,30,31 - stfpsux 20,3,11 - fpadd 24,24,26 - stfpsux 24,3,11 - fpadd 28,28,30 - stfpsux 28,3,11 - - li 0,16 # restore call-saved registers - lfpdx 31,0,1 - lfpdux 30,1,0 - lfpdux 29,1,0 - lfpdux 28,1,0 - lfpdux 27,1,0 - lfpdux 26,1,0 - lfpdux 25,1,0 - lfpdux 24,1,0 - lfpdux 23,1,0 - lfpdux 22,1,0 - lfpdux 21,1,0 - lfpdux 20,1,0 - lfpdux 19,1,0 - lfpdux 18,1,0 - lfpdux 17,1,0 - lfpdux 16,1,0 - lfpdux 15,1,0 - lfpdux 14,1,0 - addi 1,1,16 - 
- blr - - -.global _add_5_single_precision_vectors -_add_5_single_precision_vectors: - - li 0,-16 - stfpdux 14,1,0 - stfpdux 15,1,0 - stfpdux 16,1,0 - stfpdux 17,1,0 - stfpdux 18,1,0 - stfpdux 19,1,0 - stfpdux 24,1,0 - stfpdux 25,1,0 - stfpdux 26,1,0 - stfpdux 27,1,0 - stfpdux 28,1,0 - stfpdux 29,1,0 - - srwi 9,9,3 - li 11,8 - subi 9,9,1 - mtctr 9 - subi 3,3,8 - - cmpwi 0,9,0 - - lfpsx 0,0,4 - lfpsx 1,0,5 - lfpsx 2,0,6 - lfpsx 3,0,7 - lfpsx 4,0,8 - - lfpsux 5,4,11 - lfpsux 6,5,11 - lfpsux 7,6,11 - lfpsux 8,7,11 - lfpsux 9,8,11 - - lfpsux 10,4,11 - lfpsux 11,5,11 - lfpsux 12,6,11 - lfpsux 13,7,11 - lfpsux 14,8,11 - fpadd 24,0,1 - fpadd 25,2,3 - - lfpsux 15,4,11 - lfpsux 16,5,11 - lfpsux 17,6,11 - fpadd 24,24,25 - lfpsux 18,7,11 - lfpsux 19,8,11 - fpadd 27,5,6 - fpadd 28,7,8 - - beq- 0,1f - li 10,48 - -0: - dcbt 4,10 - lfpsux 0,4,11 - fpadd 24,24,4 - lfpsux 1,5,11 - lfpsux 2,6,11 - fpadd 27,27,28 - lfpsux 3,7,11 - lfpsux 4,8,11 - stfpsux 24,3,11 - fpadd 24,10,11 - dcbt 5,10 - fpadd 25,12,13 - - lfpsux 5,4,11 - fpadd 27,27,9 - lfpsux 6,5,11 - lfpsux 7,6,11 - fpadd 24,24,25 - lfpsux 8,7,11 - lfpsux 9,8,11 - stfpsux 27,3,11 - fpadd 27,15,16 - dcbt 6,10 - fpadd 28,17,18 - - lfpsux 10,4,11 - fpadd 24,24,14 - lfpsux 11,5,11 - lfpsux 12,6,11 - fpadd 27,27,28 - lfpsux 13,7,11 - lfpsux 14,8,11 - stfpsux 24,3,11 - fpadd 24,0,1 - dcbt 7,10 - fpadd 25,2,3 - - lfpsux 15,4,11 - fpadd 27,27,19 - lfpsux 16,5,11 - lfpsux 17,6,11 - fpadd 24,24,25 - dcbt 8,10 - lfpsux 18,7,11 - lfpsux 19,8,11 - stfpsux 27,3,11 - fpadd 27,5,6 - fpadd 28,7,8 - - bdnz 0b - -1: - fpadd 24,24,4 - fpadd 27,27,28 - stfpsux 24,3,11 - fpadd 24,10,11 - fpadd 25,12,13 - - fpadd 27,27,9 - fpadd 24,24,25 - stfpsux 27,3,11 - fpadd 27,15,16 - fpadd 28,17,18 - - fpadd 24,24,14 - fpadd 27,27,28 - stfpsux 24,3,11 - - fpadd 27,27,19 - stfpsux 27,3,11 - - li 0,16 # restore call-saved registers - lfpdx 29,0,1 - lfpdux 28,1,0 - lfpdux 27,1,0 - lfpdux 26,1,0 - lfpdux 25,1,0 - lfpdux 24,1,0 - lfpdux 19,1,0 - lfpdux 18,1,0 - 
lfpdux 17,1,0 - lfpdux 16,1,0 - lfpdux 15,1,0 - lfpdux 14,1,0 - addi 1,1,16 - blr - - -.global _add_6_single_precision_vectors -_add_6_single_precision_vectors: - - li 0,-16 - stfpdux 14,1,0 - stfpdux 15,1,0 - stfpdux 16,1,0 - stfpdux 17,1,0 - stfpdux 18,1,0 - stfpdux 19,1,0 - stfpdux 20,1,0 - stfpdux 21,1,0 - stfpdux 22,1,0 - stfpdux 23,1,0 - stfpdux 24,1,0 - stfpdux 25,1,0 - stfpdux 26,1,0 - stfpdux 27,1,0 - stfpdux 28,1,0 - stfpdux 29,1,0 - - srwi 10,10,3 - li 11,8 - subi 10,10,1 - mtctr 10 - subi 3,3,8 - - cmpwi 0,10,0 - - lfpsx 0,0,4 - lfpsx 1,0,5 - lfpsx 2,0,6 - lfpsx 3,0,7 - lfpsx 4,0,8 - lfpsx 5,0,9 - - lfpsux 6,4,11 - lfpsux 7,5,11 - lfpsux 8,6,11 - lfpsux 9,7,11 - lfpsux 10,8,11 - lfpsux 11,9,11 - - lfpsux 12,4,11 - lfpsux 13,5,11 - lfpsux 14,6,11 - lfpsux 15,7,11 - lfpsux 16,8,11 - lfpsux 17,9,11 - fpadd 24,0,1 - fpadd 25,2,3 - - lfpsux 18,4,11 - lfpsux 19,5,11 - lfpsux 20,6,11 - fpadd 26,4,5 - fpadd 24,24,25 - lfpsux 21,7,11 - lfpsux 22,8,11 - lfpsux 23,9,11 - fpadd 27,6,7 - fpadd 28,8,9 - - beq- 0,1f - li 10,48 - -0: - dcbt 4,10 - lfpsux 0,4,11 - fpadd 24,24,26 - lfpsux 1,5,11 - lfpsux 2,6,11 - fpadd 29,10,11 - fpadd 27,27,28 - lfpsux 3,7,11 - lfpsux 4,8,11 - lfpsux 5,9,11 - dcbt 5,10 - stfpsux 24,3,11 - fpadd 24,12,13 - fpadd 25,14,15 - - lfpsux 6,4,11 - fpadd 27,27,29 - lfpsux 7,5,11 - lfpsux 8,6,11 - dcbt 6,10 - fpadd 26,16,17 - fpadd 24,24,25 - lfpsux 9,7,11 - lfpsux 10,8,11 - lfpsux 11,9,11 - stfpsux 27,3,11 - fpadd 27,18,19 - fpadd 28,20,21 - - dcbt 7,10 - lfpsux 12,4,11 - fpadd 24,24,26 - lfpsux 13,5,11 - lfpsux 14,6,11 - fpadd 29,22,23 - fpadd 27,27,28 - lfpsux 15,7,11 - lfpsux 16,8,11 - lfpsux 17,9,11 - dcbt 8,10 - stfpsux 24,3,11 - fpadd 24,0,1 - fpadd 25,2,3 - - lfpsux 18,4,11 - fpadd 27,27,29 - lfpsux 19,5,11 - lfpsux 20,6,11 - dcbt 9,10 - fpadd 26,4,5 - fpadd 24,24,25 - lfpsux 21,7,11 - lfpsux 22,8,11 - lfpsux 23,9,11 - stfpsux 27,3,11 - fpadd 27,6,7 - fpadd 28,8,9 - - bdnz 0b - -1: - fpadd 24,24,26 - fpadd 29,10,11 - fpadd 27,27,28 - 
stfpsux 24,3,11 - fpadd 24,12,13 - fpadd 25,14,15 - - fpadd 27,27,29 - fpadd 26,16,17 - fpadd 24,24,25 - stfpsux 27,3,11 - fpadd 27,18,19 - fpadd 28,20,21 - - fpadd 24,24,26 - fpadd 29,22,23 - fpadd 27,27,28 - stfpsux 24,3,11 - - fpadd 27,27,29 - stfpsux 27,3,11 - - li 0,16 # restore call-saved registers - lfpdx 29,0,1 - lfpdux 28,1,0 - lfpdux 27,1,0 - lfpdux 26,1,0 - lfpdux 25,1,0 - lfpdux 24,1,0 - lfpdux 23,1,0 - lfpdux 22,1,0 - lfpdux 21,1,0 - lfpdux 20,1,0 - lfpdux 19,1,0 - lfpdux 18,1,0 - lfpdux 17,1,0 - lfpdux 16,1,0 - lfpdux 15,1,0 - lfpdux 14,1,0 - addi 1,1,16 - blr - - .align 5 .global _beamform_3stations_6beams diff --git a/RTCP/CNProc/src/BeamFormerAsm.h b/RTCP/CNProc/src/BeamFormerAsm.h index 88f8c136c0dfe0fff1ff3095125128e9cf4e9bde..2a8399e10dcd7e39d0f33354be9b8c19dd8afc44 100644 --- a/RTCP/CNProc/src/BeamFormerAsm.h +++ b/RTCP/CNProc/src/BeamFormerAsm.h @@ -3,6 +3,7 @@ #if defined HAVE_BGP +#include <Interface/BGPAsm.h> #include <cstring> namespace LOFAR { @@ -10,53 +11,6 @@ namespace RTCP { extern "C" { -// all float * must be aligned to 8 bytes - -void _add_2_single_precision_vectors( - /* r3 */ float *dst, - /* r4 */ const float *src1, - /* r5 */ const float *src2, - /* r6 */ unsigned count /* non-zero; multiple of 16 */ -); - -void _add_3_single_precision_vectors( - /* r3 */ float *dst, - /* r4 */ const float *src1, - /* r5 */ const float *src2, - /* r6 */ const float *src3, - /* r7 */ unsigned count /* non-zero; multiple of 16 */ -); - -void _add_4_single_precision_vectors( - /* r3 */ float *dst, - /* r4 */ const float *src1, - /* r5 */ const float *src2, - /* r6 */ const float *src3, - /* r7 */ const float *src4, - /* r8 */ unsigned count /* non-zero; multiple of 16 */ -); - -void _add_5_single_precision_vectors( - /* r3 */ float *dst, - /* r4 */ const float *src1, - /* r5 */ const float *src2, - /* r6 */ const float *src3, - /* r7 */ const float *src4, - /* r8 */ const float *src5, - /* r9 */ unsigned count /* non-zero; multiple of 16 */ -); - 
-void _add_6_single_precision_vectors( - /* r3 */ float *dst, - /* r4 */ const float *src1, - /* r5 */ const float *src2, - /* r6 */ const float *src3, - /* r7 */ const float *src4, - /* r8 */ const float *src5, - /* r9 */ const float *src6, - /* r10 */ unsigned count /* non-zero; multiple of 16 */ -); - #if 0 void _beamform_3beams( /* r3 */ fcomplex *dst, @@ -134,33 +88,6 @@ void _beamform_6beams_2times( } // extern "C" -// Similar functions that do not need or have an ASM version - -// defined just to aid the use of macros -static inline void _add_1_single_precision_vectors( - float *dst, - const float *src1, - unsigned count /* non-zero; multiple of 16 */ -) { - // nothing to add, so just copy the values - memcpy( dst, src1, count * sizeof(float) ); -} - -static inline void _add_7_single_precision_vectors( - float *dst, - const float *src1, - const float *src2, - const float *src3, - const float *src4, - const float *src5, - const float *src6, - const float *src7, - unsigned count /* non-zero; multiple of 16 */ -) { - _add_4_single_precision_vectors( dst, src1, src2, src3, src4, count ); - _add_4_single_precision_vectors( dst, dst, src5, src6, src7, count ); -} - } // namespace LOFAR::RTCP } // namespace LOFAR diff --git a/RTCP/Interface/CMakeLists.txt b/RTCP/Interface/CMakeLists.txt index 93c6b119ba8eb0ea13d3d27763f959b20293f5f9..2b21d242021b5022ae698ce1ad7c4ce81ddc1d38 100644 --- a/RTCP/Interface/CMakeLists.txt +++ b/RTCP/Interface/CMakeLists.txt @@ -10,6 +10,20 @@ if(USE_VALGRIND) add_definitions(-DUSE_VALGRIND) endif(USE_VALGRIND) +## --------------------------------------------------------------------------- +## Enable BGP specific assembler. +## Use the BGP assembler also for linking C/C++ programs. +## +## NOTE: Maybe this should be moved to the variants file. That requires that +## CN and ION are no longer variants; we then need two compiler definitions: +## BGPCN and BGPION, which is a cleaner solution anyway. 
+## --------------------------------------------------------------------------- +enable_language(ASM-BGP) +if(LOFAR_COMPILER_SUITE STREQUAL "BGPCN") + set(CMAKE_C_LINK_EXECUTABLE ${CMAKE_ASM-BGP_LINK_EXECUTABLE}) + set(CMAKE_CXX_LINK_EXECUTABLE ${CMAKE_ASM-BGP_LINK_EXECUTABLE}) +endif(LOFAR_COMPILER_SUITE STREQUAL "BGPCN") + add_subdirectory(include/Interface) add_subdirectory(src) add_subdirectory(test) diff --git a/RTCP/Interface/include/Interface/BGPAsm.h b/RTCP/Interface/include/Interface/BGPAsm.h new file mode 100644 index 0000000000000000000000000000000000000000..8a1c9a2faae1cd9fc318ae5aa670d8680cc26094 --- /dev/null +++ b/RTCP/Interface/include/Interface/BGPAsm.h @@ -0,0 +1,94 @@ +#ifndef LOFAR_INTERFACE_BGPASM_H +#define LOFAR_INTERFACE_BGPASM_H + +#if defined HAVE_BGP + +#include <cstring> + +namespace LOFAR { +namespace RTCP { + +extern "C" { + +// all float * must be aligned to 8 bytes + +void _add_2_single_precision_vectors( + /* r3 */ float *dst, + /* r4 */ const float *src1, + /* r5 */ const float *src2, + /* r6 */ unsigned count /* non-zero; multiple of 16 */ +); + +void _add_3_single_precision_vectors( + /* r3 */ float *dst, + /* r4 */ const float *src1, + /* r5 */ const float *src2, + /* r6 */ const float *src3, + /* r7 */ unsigned count /* non-zero; multiple of 16 */ +); + +void _add_4_single_precision_vectors( + /* r3 */ float *dst, + /* r4 */ const float *src1, + /* r5 */ const float *src2, + /* r6 */ const float *src3, + /* r7 */ const float *src4, + /* r8 */ unsigned count /* non-zero; multiple of 16 */ +); + +void _add_5_single_precision_vectors( + /* r3 */ float *dst, + /* r4 */ const float *src1, + /* r5 */ const float *src2, + /* r6 */ const float *src3, + /* r7 */ const float *src4, + /* r8 */ const float *src5, + /* r9 */ unsigned count /* non-zero; multiple of 16 */ +); + +void _add_6_single_precision_vectors( + /* r3 */ float *dst, + /* r4 */ const float *src1, + /* r5 */ const float *src2, + /* r6 */ const float *src3, + /* r7 */ 
const float *src4, + /* r8 */ const float *src5, + /* r9 */ const float *src6, + /* r10 */ unsigned count /* non-zero; multiple of 16 */ +); + +} // extern "C" + +// Similar functions that do not need or have an ASM version + +// defined just to aid the use of macros +static inline void _add_1_single_precision_vectors( + float *dst, + const float *src1, + unsigned count /* non-zero; multiple of 16 */ +) { + // nothing to add, so just copy the values + memcpy( dst, src1, count * sizeof(float) ); +} + +static inline void _add_7_single_precision_vectors( + float *dst, + const float *src1, + const float *src2, + const float *src3, + const float *src4, + const float *src5, + const float *src6, + const float *src7, + unsigned count /* non-zero; multiple of 16 */ +) { + _add_4_single_precision_vectors( dst, src1, src2, src3, src4, count ); + _add_4_single_precision_vectors( dst, dst, src5, src6, src7, count ); +} + +} // namespace LOFAR::RTCP +} // namespace LOFAR + +#endif + +#endif diff --git a/RTCP/Interface/include/Interface/CMakeLists.txt b/RTCP/Interface/include/Interface/CMakeLists.txt index 655c0f5711b431f1af8ed204f7789392f93fdbe0..abec53f584b7e2d5d3159c0f5f945a672ce47b74 100644 --- a/RTCP/Interface/include/Interface/CMakeLists.txt +++ b/RTCP/Interface/include/Interface/CMakeLists.txt @@ -5,6 +5,7 @@ set(inst_HEADERS Allocator.h BeamCoordinates.h BeamFormedData.h + BGPAsm.h CN_Command.h CN_Mapping.h Stream.h diff --git a/RTCP/Interface/include/Interface/CorrelatedData.h b/RTCP/Interface/include/Interface/CorrelatedData.h index 381b629927ee9c8504de79b32bb637eb66839055..2c7df06ba211e11c776da6ec6bd04ddf190c2edb 100644 --- a/RTCP/Interface/include/Interface/CorrelatedData.h +++ b/RTCP/Interface/include/Interface/CorrelatedData.h @@ -7,6 +7,7 @@ #include <Interface/Allocator.h> #include <Interface/Config.h> #include <Interface/StreamableData.h> +#include <Interface/BGPAsm.h> #include <Interface/MultiDimArray.h> #include <Stream/Stream.h> @@ -122,13 +123,31 @@ inline 
void CorrelatedData::writeData(Stream *str) } -template <typename T> inline void addNrValidSamples(T *dst, const T *src, unsigned count) +template <typename T> inline void addNrValidSamples(T * __restrict__ dst, const T * __restrict__ src, unsigned count) { for (unsigned i = 0; i < count; i ++) dst[i] += src[i]; } +template<> inline void addNrValidSamples<uint16_t>(uint16_t * __restrict__ dst, const uint16_t * __restrict__ src, unsigned count) +{ + addNrValidSamples<uint32_t>(reinterpret_cast<uint32_t*>(dst), reinterpret_cast<const uint32_t*>(src), count / 2); + + if (count & 1) + dst[count - 1] += src[count - 1]; +} + + +template<> inline void addNrValidSamples<uint8_t>(uint8_t * __restrict__ dst, const uint8_t * __restrict__ src, unsigned count) +{ + addNrValidSamples<uint16_t>(reinterpret_cast<uint16_t*>(dst), reinterpret_cast<const uint16_t*>(src), count / 2); + + if (count & 1) + dst[count - 1] += src[count - 1]; +} + + inline IntegratableData &CorrelatedData::operator += (const IntegratableData &other_) { const CorrelatedData &other = static_cast<const CorrelatedData &>(other_); @@ -138,9 +157,19 @@ inline IntegratableData &CorrelatedData::operator += (const IntegratableData &ot fcomplex *dst = visibilities.origin(); const fcomplex *src = other.visibilities.origin(); unsigned count = visibilities.num_elements(); +#ifdef HAVE_BGP + unsigned fastcopyfloats = (count * 2) & ~0xF; + unsigned remainder = count % 8; + for (unsigned i = 0; i < remainder; i ++) + dst[i] += src[i]; + + if (fastcopyfloats > 0) + _add_2_single_precision_vectors( reinterpret_cast<float*>(dst + remainder), reinterpret_cast<float*>(dst + remainder), reinterpret_cast<const float*>(src + remainder), fastcopyfloats ); +#else for (unsigned i = 0; i < count; i ++) dst[i] += src[i]; +#endif } // add nr. 
valid samples diff --git a/RTCP/Interface/src/BGPAsm.S b/RTCP/Interface/src/BGPAsm.S new file mode 100644 index 0000000000000000000000000000000000000000..db7a73cbd830a19320174eabf1b63954fc07cdfe --- /dev/null +++ b/RTCP/Interface/src/BGPAsm.S @@ -0,0 +1,762 @@ +#if defined HAVE_BGP + +.global _add_2_single_precision_vectors +_add_2_single_precision_vectors: + + li 0,-16 + stfpdux 14,1,0 + stfpdux 15,1,0 + + srwi 6,6,4 + li 11,8 + subi 6,6,1 + mtctr 6 + + cmpwi 0,6,0 + lfpsx 0,0,4 + lfpsx 1,0,5 + lfpsux 2,4,11 + lfpsux 3,5,11 + lfpsux 4,4,11 + lfpsux 5,5,11 + lfpsux 6,4,11 + lfpsux 7,5,11 + lfpsux 8,4,11 + lfpsux 9,5,11 + lfpsux 10,4,11 + lfpsux 11,5,11 + fpadd 0,0,1 + lfpsux 12,4,11 + lfpsux 13,5,11 + fpadd 2,2,3 + lfpsux 14,4,11 + lfpsux 15,5,11 + stfpsx 0,0,3 + fpadd 4,4,5 + beq- 0,1f + +0: lfpsux 0,4,11 + lfpsux 1,5,11 + stfpsux 2,3,11 + fpadd 6,6,7 + lfpsux 2,4,11 + lfpsux 3,5,11 + stfpsux 4,3,11 + fpadd 8,8,9 + lfpsux 4,4,11 + lfpsux 5,5,11 + stfpsux 6,3,11 + fpadd 10,10,11 + lfpsux 6,4,11 + lfpsux 7,5,11 + stfpsux 8,3,11 + fpadd 12,12,13 + lfpsux 8,4,11 + lfpsux 9,5,11 + stfpsux 10,3,11 + fpadd 14,14,15 + lfpsux 10,4,11 + lfpsux 11,5,11 + stfpsux 12,3,11 + fpadd 0,0,1 + lfpsux 12,4,11 + lfpsux 13,5,11 + stfpsux 14,3,11 + fpadd 2,2,3 + lfpsux 14,4,11 + lfpsux 15,5,11 + stfpsux 0,3,11 + fpadd 4,4,5 + bdnz 0b + +1: stfpsux 2,3,11 + fpadd 6,6,7 + stfpsux 4,3,11 + fpadd 8,8,9 + stfpsux 6,3,11 + fpadd 10,10,11 + stfpsux 8,3,11 + fpadd 12,12,13 + stfpsux 10,3,11 + fpadd 14,14,15 + stfpsux 12,3,11 + stfpsux 14,3,11 + + li 0,16 # restore call-saved registers + lfpdx 15,0,1 + lfpdux 14,1,0 + addi 1,1,16 + + blr + + +.global _add_3_single_precision_vectors +_add_3_single_precision_vectors: + + li 0,-16 + stfpdux 14,1,0 + stfpdux 15,1,0 + stfpdux 16,1,0 + stfpdux 17,1,0 + stfpdux 18,1,0 + stfpdux 19,1,0 + stfpdux 20,1,0 + stfpdux 21,1,0 + stfpdux 22,1,0 + stfpdux 23,1,0 + + srwi 7,7,4 + li 11,8 + subi 7,7,1 + mtctr 7 + + cmpwi 0,7,0 + lfpsx 0,0,4 + lfpsx 1,0,5 + lfpsx 
2,0,6 + lfpsux 3,4,11 + lfpsux 4,5,11 + lfpsux 5,6,11 + lfpsux 6,4,11 + lfpsux 7,5,11 + lfpsux 8,6,11 + lfpsux 9,4,11 + lfpsux 10,5,11 + lfpsux 11,6,11 + lfpsux 12,4,11 + lfpsux 13,5,11 + lfpsux 14,6,11 + lfpsux 15,4,11 + lfpsux 16,5,11 + fpadd 0,0,1 + lfpsux 17,6,11 + lfpsux 18,4,11 + lfpsux 19,5,11 + fpadd 3,3,4 + lfpsux 20,6,11 + fpadd 0,0,2 + lfpsux 21,4,11 + lfpsux 22,5,11 + fpadd 6,6,7 + lfpsux 23,6,11 + fpadd 3,3,5 + stfpsx 0,0,3 + beq- 0,1f + li 10,32 + +0: lfpsux 0,4,11 + lfpsux 1,5,11 + dcbt 4,10 + fpadd 9,9,10 + lfpsux 2,6,11 + fpadd 6,6,8 + stfpsux 3,3,11 + lfpsux 3,4,11 + lfpsux 4,5,11 + fpadd 12,12,13 + lfpsux 5,6,11 + fpadd 9,9,11 + stfpsux 6,3,11 + lfpsux 6,4,11 + lfpsux 7,5,11 + fpadd 15,15,16 + lfpsux 8,6,11 + fpadd 12,12,14 + stfpsux 9,3,11 + lfpsux 9,4,11 + lfpsux 10,5,11 + fpadd 18,18,19 + lfpsux 11,6,11 + fpadd 15,15,17 + stfpsux 12,3,11 + lfpsux 12,4,11 + lfpsux 13,5,11 + fpadd 21,21,22 + lfpsux 14,6,11 + fpadd 18,18,20 + stfpsux 15,3,11 + lfpsux 15,4,11 + lfpsux 16,5,11 + fpadd 0,0,1 + lfpsux 17,6,11 + fpadd 21,21,23 + stfpsux 18,3,11 + lfpsux 18,4,11 + lfpsux 19,5,11 + fpadd 3,3,4 + lfpsux 20,6,11 + fpadd 0,0,2 + stfpsux 21,3,11 + lfpsux 21,4,11 + lfpsux 22,5,11 + fpadd 6,6,7 + lfpsux 23,6,11 + fpadd 3,3,5 + stfpsux 0,3,11 + bdnz 0b + +1: fpadd 9,9,10 + fpadd 6,6,8 + stfpsux 3,3,11 + fpadd 12,12,13 + fpadd 9,9,11 + stfpsux 6,3,11 + fpadd 15,15,16 + fpadd 12,12,14 + stfpsux 9,3,11 + fpadd 18,18,19 + fpadd 15,15,17 + stfpsux 12,3,11 + fpadd 21,21,22 + fpadd 18,18,20 + stfpsux 15,3,11 + fpadd 21,21,23 + stfpsux 18,3,11 + stfpsux 21,3,11 + + li 0,16 # restore call-saved registers + lfpdx 23,0,1 + lfpdux 22,1,0 + lfpdux 21,1,0 + lfpdux 20,1,0 + lfpdux 19,1,0 + lfpdux 18,1,0 + lfpdux 17,1,0 + lfpdux 16,1,0 + lfpdux 15,1,0 + lfpdux 14,1,0 + addi 1,1,16 + + blr + + +.global _add_4_single_precision_vectors +_add_4_single_precision_vectors: + + li 0,-16 + stfpdux 14,1,0 + stfpdux 15,1,0 + stfpdux 16,1,0 + stfpdux 17,1,0 + stfpdux 18,1,0 + stfpdux 
19,1,0 + stfpdux 20,1,0 + stfpdux 21,1,0 + stfpdux 22,1,0 + stfpdux 23,1,0 + stfpdux 24,1,0 + stfpdux 25,1,0 + stfpdux 26,1,0 + stfpdux 27,1,0 + stfpdux 28,1,0 + stfpdux 29,1,0 + stfpdux 30,1,0 + stfpdux 31,1,0 + + srwi 8,8,4 + li 11,8 + subi 8,8,1 + mtctr 8 + + cmpwi 0,8,0 + lfpsx 0,0,4 + lfpsx 1,0,5 + lfpsx 2,0,6 + lfpsx 3,0,7 + lfpsux 4,4,11 + lfpsux 5,5,11 + lfpsux 6,6,11 + lfpsux 7,7,11 + lfpsux 8,4,11 + lfpsux 9,5,11 + lfpsux 10,6,11 + lfpsux 11,7,11 + lfpsux 12,4,11 + lfpsux 13,5,11 + lfpsux 14,6,11 + lfpsux 15,7,11 + lfpsux 16,4,11 + lfpsux 17,5,11 + lfpsux 18,6,11 + lfpsux 19,7,11 + lfpsux 20,4,11 + lfpsux 21,5,11 + fpadd 0,0,1 + fpadd 2,2,3 + lfpsux 22,6,11 + lfpsux 23,7,11 + lfpsux 24,4,11 + lfpsux 25,5,11 + fpadd 4,4,5 + fpadd 6,6,7 + lfpsux 26,6,11 + lfpsux 27,7,11 + fpadd 0,0,2 + lfpsux 28,4,11 + lfpsux 29,5,11 + fpadd 8,8,9 + fpadd 10,10,11 + lfpsux 30,6,11 + lfpsux 31,7,11 + + stfpsx 0,0,3 + fpadd 4,4,6 + beq- 0,1f + +0: lfpsux 0,4,11 + lfpsux 1,5,11 + fpadd 12,12,13 + fpadd 14,14,15 + lfpsux 2,6,11 + lfpsux 3,7,11 + + stfpsux 4,3,11 + fpadd 8,8,10 + + lfpsux 4,4,11 + lfpsux 5,5,11 + fpadd 16,16,17 + fpadd 18,18,19 + lfpsux 6,6,11 + lfpsux 7,7,11 + + stfpsux 8,3,11 + fpadd 12,12,14 + + lfpsux 8,4,11 + lfpsux 9,5,11 + fpadd 20,20,21 + fpadd 22,22,23 + lfpsux 10,6,11 + lfpsux 11,7,11 + + stfpsux 12,3,11 + fpadd 16,16,18 + + lfpsux 12,4,11 + lfpsux 13,5,11 + fpadd 24,24,25 + fpadd 26,26,27 + lfpsux 14,6,11 + lfpsux 15,7,11 + + stfpsux 16,3,11 + fpadd 20,20,22 + + lfpsux 16,4,11 + lfpsux 17,5,11 + fpadd 28,28,29 + fpadd 30,30,31 + lfpsux 18,6,11 + lfpsux 19,7,11 + + stfpsux 20,3,11 + fpadd 24,24,26 + + lfpsux 20,4,11 + lfpsux 21,5,11 + fpadd 0,0,1 + fpadd 2,2,3 + lfpsux 22,6,11 + lfpsux 23,7,11 + + stfpsux 24,3,11 + fpadd 28,28,30 + + lfpsux 24,4,11 + lfpsux 25,5,11 + fpadd 4,4,5 + fpadd 6,6,7 + lfpsux 26,6,11 + lfpsux 27,7,11 + + stfpsux 28,3,11 + fpadd 0,0,2 + + lfpsux 28,4,11 + lfpsux 29,5,11 + fpadd 8,8,9 + fpadd 10,10,11 + lfpsux 30,6,11 + lfpsux 
31,7,11 + + stfpsux 0,3,11 + fpadd 4,4,6 + + bdnz 0b + +1: fpadd 12,12,13 /* drain: only sums of already-loaded registers remain */ + fpadd 14,14,15 + + stfpsux 4,3,11 + fpadd 8,8,10 + + fpadd 16,16,17 + fpadd 18,18,19 + + stfpsux 8,3,11 + fpadd 12,12,14 + + fpadd 20,20,21 + fpadd 22,22,23 + + stfpsux 12,3,11 + fpadd 16,16,18 + + fpadd 24,24,25 + fpadd 26,26,27 + + stfpsux 16,3,11 + fpadd 20,20,22 + + /* No loads in the drain: prologue and loop have already fetched every input pair; */ + /* a further lfpsux on r4/r5 would read 8 bytes past the end of src1 and src2. */ + fpadd 28,28,29 + fpadd 30,30,31 + stfpsux 20,3,11 + fpadd 24,24,26 + stfpsux 24,3,11 + fpadd 28,28,30 + stfpsux 28,3,11 + + li 0,16 # restore call-saved registers + lfpdx 31,0,1 + lfpdux 30,1,0 + lfpdux 29,1,0 + lfpdux 28,1,0 + lfpdux 27,1,0 + lfpdux 26,1,0 + lfpdux 25,1,0 + lfpdux 24,1,0 + lfpdux 23,1,0 + lfpdux 22,1,0 + lfpdux 21,1,0 + lfpdux 20,1,0 + lfpdux 19,1,0 + lfpdux 18,1,0 + lfpdux 17,1,0 + lfpdux 16,1,0 + lfpdux 15,1,0 + lfpdux 14,1,0 + addi 1,1,16 + + blr + + +.global _add_5_single_precision_vectors +_add_5_single_precision_vectors: + + li 0,-16 + stfpdux 14,1,0 + stfpdux 15,1,0 + stfpdux 16,1,0 + stfpdux 17,1,0 + stfpdux 18,1,0 + stfpdux 19,1,0 + stfpdux 24,1,0 + stfpdux 25,1,0 + stfpdux 26,1,0 + stfpdux 27,1,0 + stfpdux 28,1,0 + stfpdux 29,1,0 + + srwi 9,9,3 + li 11,8 + subi 9,9,1 + mtctr 9 + subi 3,3,8 + + cmpwi 0,9,0 + + lfpsx 0,0,4 + lfpsx 1,0,5 + lfpsx 2,0,6 + lfpsx 3,0,7 + lfpsx 4,0,8 + + lfpsux 5,4,11 + lfpsux 6,5,11 + lfpsux 7,6,11 + lfpsux 8,7,11 + lfpsux 9,8,11 + + lfpsux 10,4,11 + lfpsux 11,5,11 + lfpsux 12,6,11 + lfpsux 13,7,11 + lfpsux 14,8,11 + fpadd 24,0,1 + fpadd 25,2,3 + + lfpsux 15,4,11 + lfpsux 16,5,11 + lfpsux 17,6,11 + fpadd 24,24,25 + lfpsux 18,7,11 + lfpsux 19,8,11 + fpadd 27,5,6 + fpadd 28,7,8 + + beq- 0,1f + li 10,48 + +0: + dcbt 4,10 + lfpsux 0,4,11 + fpadd 24,24,4 + lfpsux 1,5,11 + lfpsux 2,6,11 + fpadd 27,27,28 + lfpsux 3,7,11 + lfpsux 4,8,11 + stfpsux 24,3,11 + fpadd 24,10,11 + dcbt 5,10 + fpadd 25,12,13 + + lfpsux 5,4,11 + fpadd 27,27,9 + lfpsux 6,5,11 + lfpsux 7,6,11 + fpadd 24,24,25 + lfpsux 8,7,11 + lfpsux 9,8,11 + stfpsux 27,3,11 + fpadd
27,15,16 + dcbt 6,10 + fpadd 28,17,18 + + lfpsux 10,4,11 + fpadd 24,24,14 + lfpsux 11,5,11 + lfpsux 12,6,11 + fpadd 27,27,28 + lfpsux 13,7,11 + lfpsux 14,8,11 + stfpsux 24,3,11 + fpadd 24,0,1 + dcbt 7,10 + fpadd 25,2,3 + + lfpsux 15,4,11 + fpadd 27,27,19 + lfpsux 16,5,11 + lfpsux 17,6,11 + fpadd 24,24,25 + dcbt 8,10 + lfpsux 18,7,11 + lfpsux 19,8,11 + stfpsux 27,3,11 + fpadd 27,5,6 + fpadd 28,7,8 + + bdnz 0b + +1: + fpadd 24,24,4 + fpadd 27,27,28 + stfpsux 24,3,11 + fpadd 24,10,11 + fpadd 25,12,13 + + fpadd 27,27,9 + fpadd 24,24,25 + stfpsux 27,3,11 + fpadd 27,15,16 + fpadd 28,17,18 + + fpadd 24,24,14 + fpadd 27,27,28 + stfpsux 24,3,11 + + fpadd 27,27,19 + stfpsux 27,3,11 + + li 0,16 # restore call-saved registers + lfpdx 29,0,1 + lfpdux 28,1,0 + lfpdux 27,1,0 + lfpdux 26,1,0 + lfpdux 25,1,0 + lfpdux 24,1,0 + lfpdux 19,1,0 + lfpdux 18,1,0 + lfpdux 17,1,0 + lfpdux 16,1,0 + lfpdux 15,1,0 + lfpdux 14,1,0 + addi 1,1,16 + blr + + +.global _add_6_single_precision_vectors +_add_6_single_precision_vectors: + + li 0,-16 + stfpdux 14,1,0 + stfpdux 15,1,0 + stfpdux 16,1,0 + stfpdux 17,1,0 + stfpdux 18,1,0 + stfpdux 19,1,0 + stfpdux 20,1,0 + stfpdux 21,1,0 + stfpdux 22,1,0 + stfpdux 23,1,0 + stfpdux 24,1,0 + stfpdux 25,1,0 + stfpdux 26,1,0 + stfpdux 27,1,0 + stfpdux 28,1,0 + stfpdux 29,1,0 + + srwi 10,10,3 + li 11,8 + subi 10,10,1 + mtctr 10 + subi 3,3,8 + + cmpwi 0,10,0 + + lfpsx 0,0,4 + lfpsx 1,0,5 + lfpsx 2,0,6 + lfpsx 3,0,7 + lfpsx 4,0,8 + lfpsx 5,0,9 + + lfpsux 6,4,11 + lfpsux 7,5,11 + lfpsux 8,6,11 + lfpsux 9,7,11 + lfpsux 10,8,11 + lfpsux 11,9,11 + + lfpsux 12,4,11 + lfpsux 13,5,11 + lfpsux 14,6,11 + lfpsux 15,7,11 + lfpsux 16,8,11 + lfpsux 17,9,11 + fpadd 24,0,1 + fpadd 25,2,3 + + lfpsux 18,4,11 + lfpsux 19,5,11 + lfpsux 20,6,11 + fpadd 26,4,5 + fpadd 24,24,25 + lfpsux 21,7,11 + lfpsux 22,8,11 + lfpsux 23,9,11 + fpadd 27,6,7 + fpadd 28,8,9 + + beq- 0,1f + li 10,48 + +0: + dcbt 4,10 + lfpsux 0,4,11 + fpadd 24,24,26 + lfpsux 1,5,11 + lfpsux 2,6,11 + fpadd 29,10,11 + 
fpadd 27,27,28 + lfpsux 3,7,11 + lfpsux 4,8,11 + lfpsux 5,9,11 + dcbt 5,10 + stfpsux 24,3,11 + fpadd 24,12,13 + fpadd 25,14,15 + + lfpsux 6,4,11 + fpadd 27,27,29 + lfpsux 7,5,11 + lfpsux 8,6,11 + dcbt 6,10 + fpadd 26,16,17 + fpadd 24,24,25 + lfpsux 9,7,11 + lfpsux 10,8,11 + lfpsux 11,9,11 + stfpsux 27,3,11 + fpadd 27,18,19 + fpadd 28,20,21 + + dcbt 7,10 + lfpsux 12,4,11 + fpadd 24,24,26 + lfpsux 13,5,11 + lfpsux 14,6,11 + fpadd 29,22,23 + fpadd 27,27,28 + lfpsux 15,7,11 + lfpsux 16,8,11 + lfpsux 17,9,11 + dcbt 8,10 + stfpsux 24,3,11 + fpadd 24,0,1 + fpadd 25,2,3 + + lfpsux 18,4,11 + fpadd 27,27,29 + lfpsux 19,5,11 + lfpsux 20,6,11 + dcbt 9,10 + fpadd 26,4,5 + fpadd 24,24,25 + lfpsux 21,7,11 + lfpsux 22,8,11 + lfpsux 23,9,11 + stfpsux 27,3,11 + fpadd 27,6,7 + fpadd 28,8,9 + + bdnz 0b + +1: + fpadd 24,24,26 + fpadd 29,10,11 + fpadd 27,27,28 + stfpsux 24,3,11 + fpadd 24,12,13 + fpadd 25,14,15 + + fpadd 27,27,29 + fpadd 26,16,17 + fpadd 24,24,25 + stfpsux 27,3,11 + fpadd 27,18,19 + fpadd 28,20,21 + + fpadd 24,24,26 + fpadd 29,22,23 + fpadd 27,27,28 + stfpsux 24,3,11 + + fpadd 27,27,29 + stfpsux 27,3,11 + + li 0,16 # restore call-saved registers + lfpdx 29,0,1 + lfpdux 28,1,0 + lfpdux 27,1,0 + lfpdux 26,1,0 + lfpdux 25,1,0 + lfpdux 24,1,0 + lfpdux 23,1,0 + lfpdux 22,1,0 + lfpdux 21,1,0 + lfpdux 20,1,0 + lfpdux 19,1,0 + lfpdux 18,1,0 + lfpdux 17,1,0 + lfpdux 16,1,0 + lfpdux 15,1,0 + lfpdux 14,1,0 + addi 1,1,16 + blr + +#endif diff --git a/RTCP/Interface/src/CMakeLists.txt b/RTCP/Interface/src/CMakeLists.txt index cb2cf20ff67b02c8d0c6f0ff2ad075aa54795708..40f0e69fd95c1b36ba982509c03308941ba063e5 100644 --- a/RTCP/Interface/src/CMakeLists.txt +++ b/RTCP/Interface/src/CMakeLists.txt @@ -2,7 +2,7 @@ include(LofarPackageVersion) -lofar_add_library(interface +set(interface_LIB_SRCS Package__Version.cc Allocator.cc BeamCoordinates.cc @@ -12,4 +12,16 @@ lofar_add_library(interface Parset.cc RSPTimeStamp.cc) +# Maybe we should use LOFAR_COMPILER_SUITE, because
ASM-BGP_COMPILER_WORKS +# is not reliable: i.e. +#if(LOFAR_COMPILER_SUITE STREQUAL BGPCN) +if(CMAKE_ASM-BGP_COMPILER_WORKS) + list(APPEND interface_LIB_SRCS + BGPAsm.S) +endif(CMAKE_ASM-BGP_COMPILER_WORKS) +#endif(LOFAR_COMPILER_SUITE STREQUAL BGPCN) + +lofar_add_library(interface ${interface_LIB_SRCS}) + lofar_add_bin_program(versioninterface versioninterface.cc) + diff --git a/RTCP/Interface/test/CMakeLists.txt b/RTCP/Interface/test/CMakeLists.txt index 391e1a16d323a55db0ecee4751f9b3fa7735eef0..53e188beba7a6064ed65110ddc9f91027b7696d2 100644 --- a/RTCP/Interface/test/CMakeLists.txt +++ b/RTCP/Interface/test/CMakeLists.txt @@ -2,5 +2,6 @@ include(LofarCTest) +lofar_add_test(tCorrelatedData tCorrelatedData.cc) lofar_add_test(tSparseSet tSparseSet.cc) lofar_add_test(tRSPTimeStamp tRSPTimeStamp.cc) diff --git a/RTCP/Interface/test/tCorrelatedData.cc b/RTCP/Interface/test/tCorrelatedData.cc new file mode 100644 index 0000000000000000000000000000000000000000..ded003c78f00b668efc71043d6c96214a9ef5a0a --- /dev/null +++ b/RTCP/Interface/test/tCorrelatedData.cc @@ -0,0 +1,86 @@ +#include <lofar_config.h> + +#include <Common/Timer.h> + +#include <Interface/CorrelatedData.h> + +#include <cassert> +#include <iostream> + + +using namespace LOFAR; +using namespace LOFAR::RTCP; +using namespace std; + +int main(void) +{ + NSTimer timer("addition", true, false); + + unsigned nr_maxsamples[] = { 255, 65535, 1000000 }; // encode using 1, 2, 4 bytes, respectively + unsigned nr_channels[] = { 1, 16, 64, 256 }; + unsigned nr_stations[] = { 1, 2, 3, 4, 5, 24 }; + + for( unsigned s = 0; s < sizeof nr_maxsamples / sizeof nr_maxsamples[0]; ++s ) // sweep every combination of the three lists above; each one checks that data1 += data2 matches a precomputed data3 + for( unsigned ch = 0; ch < sizeof nr_channels / sizeof nr_channels[0]; ++ch ) + for( unsigned st = 0; st < sizeof nr_stations / sizeof nr_stations[0]; ++st ) { + unsigned ns = nr_maxsamples[s]; + unsigned nch = nr_channels[ch]; + unsigned nst = nr_stations[st]; + unsigned nbl = nst * (nst + 1) / 2; // number of station pairs when a station is also paired with itself + + cout << nst << " stations (= " << nbl << " 
baselines), " << nch << " channels, " << ns << " samples" << endl; + + // we will test whether data1 + data2 = data3 + CorrelatedData data1(nst, nch, ns), data2(nst, nch, ns), data3(nst, nch, ns); + + // initialise data + cout << "init" << endl; + unsigned n = 0; + + for( unsigned i = 0; i < nbl; i++ ) { + for( unsigned j = 0; j < nch; j++ ) { + n++; + + data1.setNrValidSamples(i, j, (n*1) % (ns/2)); // both counts are reduced modulo ns/2, so their sum stored in data3 stays below ns + data2.setNrValidSamples(i, j, (n*2) % (ns/2)); + data3.setNrValidSamples(i, j, ((n*1) % (ns/2)) + ((n*2) % (ns/2))); + + for( unsigned k = 0; k < 2; k++ ) { + for( unsigned l = 0; l < 2; l++ ) { + data1.visibilities[i][j][k][l] = 1 * ((i + j) * 10 + k * 2 + l); // data1 holds v, data2 holds 1000 v, data3 holds the expected total 1001 v + data2.visibilities[i][j][k][l] = 1000 * ((i + j) * 10 + k * 2 + l); + data3.visibilities[i][j][k][l] = 1001 * ((i + j) * 10 + k * 2 + l); + } + } + } + } + + // add + cout << "add" << endl; + timer.start(); + data1 += data2; // operation under test; only this statement sits between timer.start() and timer.stop() + timer.stop(); + + // verify + cout << "verify" << endl; + for( unsigned i = 0; i < nbl; i++ ) { + for( unsigned j = 0; j < nch; j++ ) { + //cout << data1.nrValidSamples(i, j) << " == " << data3.nrValidSamples(i, j) << endl; + assert(data1.nrValidSamples(i, j) == data3.nrValidSamples(i, j)); // NOTE(review): assert expands to nothing when NDEBUG is defined, so such a build would pass without checking anything + + for( unsigned k = 0; k < 2; k++ ) { + for( unsigned l = 0; l < 2; l++ ) { + assert( + data1.visibilities[i][j][k][l] == + data3.visibilities[i][j][k][l] + ); + } + } + } + } + + cout << "ok" << endl; + } + + return 0; +} diff --git a/RTCP/Run/src/OLAP.parset b/RTCP/Run/src/OLAP.parset index 2d586feaf842887921859515f83ca795fb8c427b..7e5e4860669831978de29881c93a57723180ad5a 100644 --- a/RTCP/Run/src/OLAP.parset +++ b/RTCP/Run/src/OLAP.parset @@ -37,10 +37,10 @@ PIC.Core.CS007HBA0.clockCorrectionTime = 7.913020e-06 PIC.Core.CS007HBA1.clockCorrectionTime = 7.913260e-06 PIC.Core.CS007HBA.clockCorrectionTime = 7.913140e-06 -PIC.Core.CS001LBA.clockCorrectionTime = 4.309154e-06 -PIC.Core.CS001HBA0.clockCorrectionTime = 4.309154e-06 -PIC.Core.CS001HBA1.clockCorrectionTime = 4.309154e-06 
-PIC.Core.CS001HBA.clockCorrectionTime = 4.309154e-06 +PIC.Core.CS001LBA.clockCorrectionTime = 4.759754e-06 +PIC.Core.CS001HBA0.clockCorrectionTime = 4.759754e-06 +PIC.Core.CS001HBA1.clockCorrectionTime = 4.759754e-06 +PIC.Core.CS001HBA.clockCorrectionTime = 4.759754e-06 PIC.Core.CS011LBA.clockCorrectionTime = 7.55795e-06 PIC.Core.CS011HBA0.clockCorrectionTime = 7.55795e-06 @@ -52,80 +52,80 @@ PIC.Core.CS013HBA0.clockCorrectionTime = 1.639118e-05 PIC.Core.CS013HBA1.clockCorrectionTime = 1.639118e-05 PIC.Core.CS013HBA.clockCorrectionTime = 1.639118e-05 -PIC.Core.CS017LBA.clockCorrectionTime = 1.540943e-05 -PIC.Core.CS017HBA0.clockCorrectionTime = 1.540943e-05 -PIC.Core.CS017HBA1.clockCorrectionTime = 1.540943e-05 -PIC.Core.CS017HBA.clockCorrectionTime = 1.540943e-05 +PIC.Core.CS017LBA.clockCorrectionTime = 1.541095e-05 +PIC.Core.CS017HBA0.clockCorrectionTime = 1.541095e-05 +PIC.Core.CS017HBA1.clockCorrectionTime = 1.541095e-05 +PIC.Core.CS017HBA.clockCorrectionTime = 1.541095e-05 -PIC.Core.CS021LBA.clockCorrectionTime = 1.829614e-05 -PIC.Core.CS021HBA0.clockCorrectionTime = 1.829614e-05 -PIC.Core.CS021HBA1.clockCorrectionTime = 1.829614e-05 -PIC.Core.CS021HBA.clockCorrectionTime = 1.829614e-05 +PIC.Core.CS021LBA.clockCorrectionTime = 6.04963e-06 +PIC.Core.CS021HBA0.clockCorrectionTime = 6.04963e-06 +PIC.Core.CS021HBA1.clockCorrectionTime = 6.04963e-06 +PIC.Core.CS021HBA.clockCorrectionTime = 6.04963e-06 -PIC.Core.CS024LBA.clockCorrectionTime = 4.7597e-06 -PIC.Core.CS024HBA0.clockCorrectionTime = 4.7597e-06 -PIC.Core.CS024HBA1.clockCorrectionTime = 4.7597e-06 -PIC.Core.CS024HBA.clockCorrectionTime = 4.7597e-06 +PIC.Core.CS024LBA.clockCorrectionTime = 4.65818e-06 +PIC.Core.CS024HBA0.clockCorrectionTime = 4.65818e-06 +PIC.Core.CS024HBA1.clockCorrectionTime = 4.65818e-06 +PIC.Core.CS024HBA.clockCorrectionTime = 4.65818e-06 -PIC.Core.CS026LBA.clockCorrectionTime = 1.619986e-05 -PIC.Core.CS026HBA0.clockCorrectionTime = 1.619986e-05 -PIC.Core.CS026HBA1.clockCorrectionTime 
= 1.619986e-05 -PIC.Core.CS026HBA.clockCorrectionTime = 1.619986e-05 +PIC.Core.CS026LBA.clockCorrectionTime = 1.619876e-05 +PIC.Core.CS026HBA0.clockCorrectionTime = 1.619876e-05 +PIC.Core.CS026HBA1.clockCorrectionTime = 1.619876e-05 +PIC.Core.CS026HBA.clockCorrectionTime = 1.619876e-05 -PIC.Core.CS028LBA.clockCorrectionTime = 1.70614e-06 -PIC.Core.CS028HBA0.clockCorrectionTime = 1.70614e-06 -PIC.Core.CS028HBA1.clockCorrectionTime = 1.70614e-06 -PIC.Core.CS028HBA.clockCorrectionTime = 1.70614e-06 +PIC.Core.CS028LBA.clockCorrectionTime = 1.6962571e-05 +PIC.Core.CS028HBA0.clockCorrectionTime = 1.6962571e-05 +PIC.Core.CS028HBA1.clockCorrectionTime = 1.6962571e-05 +PIC.Core.CS028HBA.clockCorrectionTime = 1.6962571e-05 PIC.Core.CS030LBA.clockCorrectionTime = 9.7160576e-06 PIC.Core.CS030HBA0.clockCorrectionTime = 9.7160576e-06 PIC.Core.CS030HBA1.clockCorrectionTime = 9.7160576e-06 PIC.Core.CS030HBA.clockCorrectionTime = 9.7160576e-06 -PIC.Core.CS031LBA.clockCorrectionTime = 6.471391e-06 -PIC.Core.CS031HBA0.clockCorrectionTime = 6.471391e-06 -PIC.Core.CS031HBA1.clockCorrectionTime = 6.471391e-06 -PIC.Core.CS031HBA.clockCorrectionTime = 6.471391e-06 +PIC.Core.CS031LBA.clockCorrectionTime = 6.370090e-06 +PIC.Core.CS031HBA0.clockCorrectionTime = 6.370090e-06 +PIC.Core.CS031HBA1.clockCorrectionTime = 6.370090e-06 +PIC.Core.CS031HBA.clockCorrectionTime = 6.370090e-06 PIC.Core.CS032LBA.clockCorrectionTime = 8.546815e-06 PIC.Core.CS032HBA0.clockCorrectionTime = 8.546815e-06 PIC.Core.CS032HBA1.clockCorrectionTime = 8.546815e-06 PIC.Core.CS032HBA.clockCorrectionTime = 8.546815e-06 -PIC.Core.CS101LBA.clockCorrectionTime = 1.525453e-05 -PIC.Core.CS101HBA0.clockCorrectionTime = 1.525453e-05 -PIC.Core.CS101HBA1.clockCorrectionTime = 1.525453e-05 -PIC.Core.CS101HBA.clockCorrectionTime = 1.525453e-05 +PIC.Core.CS101LBA.clockCorrectionTime = 1.5165101e-05 +PIC.Core.CS101HBA0.clockCorrectionTime = 1.5165101e-05 +PIC.Core.CS101HBA1.clockCorrectionTime = 1.5165101e-05 
+PIC.Core.CS101HBA.clockCorrectionTime = 1.5165101e-05 -PIC.Core.CS103LBA.clockCorrectionTime = 3.560284e-05 -PIC.Core.CS103HBA0.clockCorrectionTime = 3.560284e-05 -PIC.Core.CS103HBA1.clockCorrectionTime = 3.560284e-05 -PIC.Core.CS103HBA.clockCorrectionTime = 3.560284e-05 +PIC.Core.CS103LBA.clockCorrectionTime = 3.5500922e-05 +PIC.Core.CS103HBA0.clockCorrectionTime = 3.5500922e-05 +PIC.Core.CS103HBA1.clockCorrectionTime = 3.5500922e-05 +PIC.Core.CS103HBA.clockCorrectionTime = 3.5500922e-05 PIC.Core.CS201LBA.clockCorrectionTime = 1.744858e-05 PIC.Core.CS201HBA0.clockCorrectionTime = 1.744858e-05 PIC.Core.CS201HBA1.clockCorrectionTime = 1.744858e-05 PIC.Core.CS201HBA.clockCorrectionTime = 1.744858e-05 -PIC.Core.CS301LBA.clockCorrectionTime = 7.687733e-06 -PIC.Core.CS301HBA0.clockCorrectionTime = 7.687733e-06 -PIC.Core.CS301HBA1.clockCorrectionTime = 7.687733e-06 -PIC.Core.CS301HBA.clockCorrectionTime = 7.687733e-06 +PIC.Core.CS301LBA.clockCorrectionTime = 7.692091e-06 +PIC.Core.CS301HBA0.clockCorrectionTime = 7.692091e-06 +PIC.Core.CS301HBA1.clockCorrectionTime = 7.692091e-06 +PIC.Core.CS301HBA.clockCorrectionTime = 7.692091e-06 -PIC.Core.CS302LBA.clockCorrectionTime = 1.494415e-05 -PIC.Core.CS302HBA0.clockCorrectionTime = 1.494415e-05 -PIC.Core.CS302HBA1.clockCorrectionTime = 1.494415e-05 -PIC.Core.CS302HBA.clockCorrectionTime = 1.494415e-05 +PIC.Core.CS302LBA.clockCorrectionTime = 1.5062785e-05 +PIC.Core.CS302HBA0.clockCorrectionTime = 1.5062785e-05 +PIC.Core.CS302HBA1.clockCorrectionTime = 1.5062785e-05 +PIC.Core.CS302HBA.clockCorrectionTime = 1.5062785e-05 PIC.Core.CS401LBA.clockCorrectionTime = 8.051870e-06 PIC.Core.CS401HBA0.clockCorrectionTime = 8.057504e-06 PIC.Core.CS401HBA1.clockCorrectionTime = 8.057770e-06 PIC.Core.CS401HBA.clockCorrectionTime = 8.057637e-06 -PIC.Core.CS501LBA.clockCorrectionTime = 1.65833e-05 -PIC.Core.CS501HBA0.clockCorrectionTime = 1.65833e-05 -PIC.Core.CS501HBA1.clockCorrectionTime = 1.65833e-05 -PIC.Core.CS501HBA.clockCorrectionTime 
= 1.65833e-05 +PIC.Core.CS501LBA.clockCorrectionTime = 1.65842e-05 +PIC.Core.CS501HBA0.clockCorrectionTime = 1.65842e-05 +PIC.Core.CS501HBA1.clockCorrectionTime = 1.65842e-05 +PIC.Core.CS501HBA.clockCorrectionTime = 1.65842e-05 # # Stations outside of the core (no correction needed)