From 07ece7334dc2b70aeeb95f0843bab1f2e744c822 Mon Sep 17 00:00:00 2001
From: Jan David Mol <mol@astron.nl>
Date: Wed, 27 May 2009 12:58:43 +0000
Subject: [PATCH] bug 1303: update pencil beam calculations, and calculate
 delays at IONProc

---
 RTCP/CNProc/src/ArenaMapping.h                |   1 +
 RTCP/CNProc/src/AsyncCommunication.cc         |  50 +++--
 RTCP/CNProc/src/AsyncCommunication.h          |   1 -
 RTCP/CNProc/src/AsyncTranspose.cc             |  91 +++++---
 RTCP/CNProc/src/AsyncTranspose.h              |  22 +-
 RTCP/CNProc/src/BeamFormerAsm.h               |  29 +++
 RTCP/CNProc/src/CN_Processing.cc              |  28 ++-
 RTCP/CNProc/src/CN_Processing.h               |   1 +
 RTCP/CNProc/src/CN_Processing_main.cc         |   2 +-
 RTCP/CNProc/src/InputData.h                   |  21 +-
 RTCP/CNProc/src/PPF.cc                        |   9 +-
 RTCP/CNProc/src/PencilBeams.cc                | 195 ++++++------------
 RTCP/CNProc/src/PencilBeams.h                 |  19 +-
 RTCP/CNProc/src/TransposedData.h              |  18 +-
 RTCP/IONProc/src/InputSection.cc              |  73 ++++---
 RTCP/IONProc/src/InputSection.h               |  13 +-
 RTCP/IONProc/src/WH_DelayCompensation.cc      |  53 +++--
 RTCP/IONProc/src/WH_DelayCompensation.h       |   4 +-
 .../include/Interface/CN_Configuration.h      |  35 +---
 .../include/Interface/FilteredData.h          |  28 +--
 RTCP/Interface/include/Interface/Makefile.am  |   3 +-
 RTCP/Interface/include/Interface/Parset.h     |  23 ++-
 .../include/Interface/PencilCoordinates.h     |  14 +-
 .../include/Interface/PipelineOutput.h        |   2 +-
 .../include/Interface/SubbandMetaData.h       |  81 +++++++-
 RTCP/Interface/src/CN_Configuration.cc        |  24 +--
 RTCP/Interface/src/PencilCoordinates.cc       |   5 +-
 27 files changed, 467 insertions(+), 378 deletions(-)

diff --git a/RTCP/CNProc/src/ArenaMapping.h b/RTCP/CNProc/src/ArenaMapping.h
index b8b4dfc3726..ca3844220c3 100644
--- a/RTCP/CNProc/src/ArenaMapping.h
+++ b/RTCP/CNProc/src/ArenaMapping.h
@@ -28,6 +28,7 @@
 #include <Interface/Allocator.h>
 #include <Interface/StreamableData.h>
 #include <boost/noncopyable.hpp>
+#include <Common/LofarLogger.h>
 
 namespace LOFAR {
 namespace RTCP {
diff --git a/RTCP/CNProc/src/AsyncCommunication.cc b/RTCP/CNProc/src/AsyncCommunication.cc
index 7dad61c8758..ba9c75cc09f 100644
--- a/RTCP/CNProc/src/AsyncCommunication.cc
+++ b/RTCP/CNProc/src/AsyncCommunication.cc
@@ -4,6 +4,7 @@
 #include <AsyncCommunication.h>
 
 #include <Common/Timer.h>
+#include <Interface/Exceptions.h>
 
 #include <cassert>
 #include <map>
@@ -17,14 +18,10 @@ namespace RTCP {
 #if defined HAVE_MPI
 
 AsyncCommunication::AsyncCommunication(MPI_Comm comm)
-{
-    itsCommunicator = comm;
-    itsCurrentReadHandle = 0;
-    itsCurrentWriteHandle = 0;
-}
-
-
-AsyncCommunication::~AsyncCommunication()
+:
+  itsCommunicator(comm),
+  itsCurrentReadHandle(0),
+  itsCurrentWriteHandle(0)
 {
 }
 
@@ -35,8 +32,7 @@ int AsyncCommunication::asyncRead(void* buf, unsigned size, unsigned source, int
 
     int res = MPI_Irecv(buf, size, MPI_BYTE, source, tag, itsCommunicator, &req->mpiReq);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Irecv() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Irecv() failed");
     }
 
     req->buf = buf;
@@ -57,8 +53,7 @@ int AsyncCommunication::asyncWrite(const void* buf, unsigned size, unsigned dest
 
     int res = MPI_Isend((void*)buf, size, MPI_BYTE, dest, tag, itsCommunicator, &req->mpiReq);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Isend() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Isend() failed");
     }
 
     req->buf = (void*)buf;
@@ -80,8 +75,7 @@ void AsyncCommunication::waitForRead(int handle)
 
     int res = MPI_Wait(&req->mpiReq, &status);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Wait() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Wait() failed");
     }
 
     // done, now remove from map, and free req
@@ -96,8 +90,7 @@ void AsyncCommunication::waitForWrite(int handle)
 
     int res = MPI_Wait(&req->mpiReq, &status);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Wait() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Wait() failed");
     }
 
     // done, now remove from map, and free req
@@ -112,6 +105,8 @@ int AsyncCommunication::waitForAnyRead(void*& buf, unsigned& size, unsigned& sou
     int count = itsReadHandleMap.size();
     MPI_Request reqs[count];
     int mapping[count];
+
+    ASSERT( count > 0 );
     
     int i = 0;
     for (std::map<int, AsyncRequest*>::const_iterator it = itsReadHandleMap.begin(); it != itsReadHandleMap.end(); it++) {
@@ -129,8 +124,11 @@ int AsyncCommunication::waitForAnyRead(void*& buf, unsigned& size, unsigned& sou
     waitAnyTimer.stop();
 
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Waitany() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Waitany() failed");
+    }
+
+    if (index == MPI_UNDEFINED) {
+	THROW(CNProcException,"MPI_Waitany() failed: no (pending) receives");
     }
 
     int handle = mapping[index];
@@ -153,6 +151,11 @@ void AsyncCommunication::waitForAllReads()
     MPI_Request reqs[count];
     MPI_Status status[count];
 
+    if( count == 0 ) {
+      // nothing to wait for
+      return;
+    }
+
     int i = 0;
     for (std::map<int, AsyncRequest*>::const_iterator it = itsReadHandleMap.begin(); it != itsReadHandleMap.end(); it++) {
 	AsyncRequest* r = it->second;
@@ -162,8 +165,7 @@ void AsyncCommunication::waitForAllReads()
 
     int res = MPI_Waitall(count, reqs, status);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Waitall() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Waitall() failed");
     }
 
     for (std::map<int, AsyncRequest*>::const_iterator it = itsReadHandleMap.begin(); it != itsReadHandleMap.end(); it++) {
@@ -181,6 +183,11 @@ void AsyncCommunication::waitForAllWrites()
     MPI_Request reqs[count];
     MPI_Status status[count];
 
+    if( count == 0 ) {
+      // nothing to wait for
+      return;
+    }
+
     int i = 0;
     for (std::map<int, AsyncRequest*>::const_iterator it = itsWriteHandleMap.begin(); it != itsWriteHandleMap.end(); it++) {
 	AsyncRequest* r = it->second;
@@ -190,8 +197,7 @@ void AsyncCommunication::waitForAllWrites()
 
     int res = MPI_Waitall(count, reqs, status);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Waitall() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Waitall() failed");
     }
 
     for (std::map<int, AsyncRequest*>::const_iterator it = itsWriteHandleMap.begin(); it != itsWriteHandleMap.end(); it++) {
diff --git a/RTCP/CNProc/src/AsyncCommunication.h b/RTCP/CNProc/src/AsyncCommunication.h
index 6fd69d096e6..0434916e633 100644
--- a/RTCP/CNProc/src/AsyncCommunication.h
+++ b/RTCP/CNProc/src/AsyncCommunication.h
@@ -26,7 +26,6 @@ public:
 class AsyncCommunication: boost::noncopyable {
   public:
     AsyncCommunication(MPI_Comm communicator = MPI_COMM_WORLD);
-    ~AsyncCommunication();
 
     // returns handle to this read
     int asyncRead(void* buf, unsigned size, unsigned source, int tag);
diff --git a/RTCP/CNProc/src/AsyncTranspose.cc b/RTCP/CNProc/src/AsyncTranspose.cc
index 4f0e2e59043..635d3349633 100644
--- a/RTCP/CNProc/src/AsyncTranspose.cc
+++ b/RTCP/CNProc/src/AsyncTranspose.cc
@@ -5,6 +5,7 @@
 
 #include <Interface/CN_Mapping.h>
 #include <Interface/PrintVector.h>
+#include <Common/LofarLogger.h>
 
 #include <cassert>
 
@@ -14,53 +15,55 @@ namespace RTCP {
 
 #if defined HAVE_MPI
 
-#define MAX_TAG 100000 // The maximum tag we use to represent a data message. 
-                       // Higher tags are metadata.
+#define MAX_RANK 100000 // used for message identification: id = type*MAX_RANK + rank
 
 template <typename SAMPLE_TYPE> AsyncTranspose<SAMPLE_TYPE>::AsyncTranspose(
   const bool isTransposeInput, const bool isTransposeOutput, 
   const unsigned groupNumber, const LocationInfo &locationInfo, 
   const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets, 
-  const unsigned nrSamplesToCNProc, const unsigned nrSubbands, const unsigned nrSubbandsPerPset)
+  const unsigned nrSubbands, const unsigned nrSubbandsPerPset, const unsigned nrPencilBeams )
 :
   itsIsTransposeInput(isTransposeInput),
   itsIsTransposeOutput(isTransposeOutput),
   itsNrSubbands(nrSubbands),
   itsNrSubbandsPerPset(nrSubbandsPerPset),
+  itsNrPencilBeams(nrPencilBeams),
+  itsAsyncComm(),
   itsInputPsets(inputPsets),
   itsOutputPsets(outputPsets),
   itsLocationInfo(locationInfo),
+  itsCommHandles(itsNrCommunications,inputPsets.size()),
   itsGroupNumber(groupNumber)
 {
-  InputData<SAMPLE_TYPE> oneSample( 1, nrSamplesToCNProc );
-
   for(unsigned i=0; i<inputPsets.size(); i++) {
     const unsigned rank = locationInfo.remapOnTree(inputPsets[i], itsGroupNumber);
     itsRankToPsetIndex[rank] = i;
   }
-
-  itsMessageSize = oneSample.requiredSize();
-  dataHandles.resize(inputPsets.size());
-  metaDataHandles.resize(inputPsets.size());
-  itsAsyncComm = new AsyncCommunication();
 }
 
-
-template <typename SAMPLE_TYPE> AsyncTranspose<SAMPLE_TYPE>::~AsyncTranspose()
-{
-    delete itsAsyncComm;
-}
-
-
 template <typename SAMPLE_TYPE> void AsyncTranspose<SAMPLE_TYPE>::postAllReceives(TransposedData<SAMPLE_TYPE> *transposedData)
 {
+    // there must be something to receive
+    ASSERT( itsInputPsets.size() > 0 );
+
     for(unsigned i=0; i<itsInputPsets.size(); i++) {
-      void* buf = (void*) transposedData->samples[i].origin();
       unsigned pset = itsInputPsets[i];
       unsigned rank = itsLocationInfo.remapOnTree(pset, itsGroupNumber); // TODO cache this? maybe in locationInfo itself?
 
-      dataHandles[i] = itsAsyncComm->asyncRead(buf, itsMessageSize, rank, rank);
-      metaDataHandles[i] = itsAsyncComm->asyncRead(&transposedData->metaData[i], sizeof(SubbandMetaData), rank, rank + MAX_TAG);
+      // define what to read
+      struct {
+        void *ptr;
+        size_t size;
+      } toRead[itsNrCommunications] = {
+        { transposedData->samples[i].origin(), transposedData->samples[i].num_elements() * sizeof(SAMPLE_TYPE) },
+        { &transposedData->metaData[i].itsMarshalledData, sizeof(SubbandMetaData::marshallData) },
+        { &transposedData->metaData[i].beams.front(), itsNrPencilBeams * sizeof(SubbandMetaData::beamInfo) }
+      };
+
+      // read it
+      for( unsigned h = 0; h < itsNrCommunications; h++ ) {
+        itsCommHandles[h][i] = itsAsyncComm.asyncRead(toRead[h].ptr, toRead[h].size, rank, rank + h*MAX_RANK);
+      }
     }
 }
 
@@ -73,23 +76,33 @@ template <typename SAMPLE_TYPE> unsigned AsyncTranspose<SAMPLE_TYPE>::waitForAny
     unsigned size, source;
     int tag;
 
-    // This read could return either a data message, or a meta data message.
-    itsAsyncComm->waitForAnyRead(buf, size, source, tag);
+    // This read could return any type of message (out of itsCommHandles)
+    itsAsyncComm.waitForAnyRead(buf, size, source, tag);
 
     // source is the real rank, calc pset index
     const unsigned psetIndex = itsRankToPsetIndex[source];
 
-    if(tag < MAX_TAG) { // real data message
-      dataHandles[psetIndex] = -1; // record that we have received the data
-      if(metaDataHandles[psetIndex] == -1) { // We already have the metadata
-	return psetIndex;
+    // mark the right communication handle as received
+    for( unsigned h = 0; h < itsNrCommunications; h++ ) {
+      if( static_cast<unsigned>(tag) < (h+1) * MAX_RANK ) {
+        itsCommHandles[h][psetIndex] = -1;
+        break;
       }
-    } else { // metadata message
-      metaDataHandles[psetIndex] = -1; // record that we have received the metadata
-      if(dataHandles[psetIndex] == -1) {
-	return psetIndex; // We already have the data
+    }
+
+    // check whether we have received all communications for this psetIndex.
+    // This is the case when commHandles are -1.
+    bool haveAll = true;
+    for( unsigned h = 0; h < itsNrCommunications; h++ ) {
+      if( itsCommHandles[h][psetIndex] != -1 ) {
+        haveAll = false;
+        break;
       }
     }
+
+    if( haveAll ) {
+      return psetIndex;
+    }
   }
 }
 
@@ -101,15 +114,27 @@ template <typename SAMPLE_TYPE> void AsyncTranspose<SAMPLE_TYPE>::asyncSend(cons
   const unsigned rank = itsLocationInfo.remapOnTree(pset, itsGroupNumber);
   const int tag = itsLocationInfo.rank();
 
-  itsAsyncComm->asyncWrite(inputData->samples[outputPsetNr].origin(), itsMessageSize, rank, tag);
-  itsAsyncComm->asyncWrite(&inputData->metaData[outputPsetNr], sizeof(SubbandMetaData), rank, tag + MAX_TAG);
+  // define what to write
+  struct {
+    const void *ptr;
+    const size_t size;
+  } toWrite[itsNrCommunications] = {
+    { inputData->samples[outputPsetNr].origin(), inputData->samples[outputPsetNr].num_elements() * sizeof(SAMPLE_TYPE) },
+    { &inputData->metaData[outputPsetNr].itsMarshalledData, sizeof(SubbandMetaData::marshallData) },
+    { &inputData->metaData[outputPsetNr].beams.front(), itsNrPencilBeams * sizeof(SubbandMetaData::beamInfo) }
+  };
+
+  // write it
+  for( unsigned h = 0; h < itsNrCommunications; h++ ) {
+    itsAsyncComm.asyncWrite( toWrite[h].ptr, toWrite[h].size, rank, tag + h*MAX_RANK );
+  }
 }
 
 
 template <typename SAMPLE_TYPE> void AsyncTranspose<SAMPLE_TYPE>::waitForAllSends()
 {
   // this includes the metadata writes...
-  itsAsyncComm->waitForAllWrites();
+  itsAsyncComm.waitForAllWrites();
 }
 
   
diff --git a/RTCP/CNProc/src/AsyncTranspose.h b/RTCP/CNProc/src/AsyncTranspose.h
index 6563177b672..967ffa9a927 100644
--- a/RTCP/CNProc/src/AsyncTranspose.h
+++ b/RTCP/CNProc/src/AsyncTranspose.h
@@ -38,10 +38,8 @@ template <typename SAMPLE_TYPE> class AsyncTranspose
 
   AsyncTranspose(const bool isTransposeInput, const bool isTransposeOutput, 
 		 const unsigned groupNumber, const LocationInfo &, 
-		 const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets, const unsigned nrSamplesToCNProc, 
-		 const unsigned nrSubbands, const unsigned nrSubbandsPerPset);
-
-  ~AsyncTranspose();
+		 const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets, 
+		 const unsigned nrSubbands, const unsigned nrSubbandsPerPset, const unsigned nrPencilBeams );
   
   // Post all async receives for the transpose.
   void postAllReceives(TransposedData<SAMPLE_TYPE> *transposedData);
@@ -61,24 +59,24 @@ template <typename SAMPLE_TYPE> class AsyncTranspose
 
   unsigned itsNrSubbands;
   unsigned itsNrSubbandsPerPset;
+  unsigned itsNrPencilBeams;
 
-  // the size of a data message
-  unsigned itsMessageSize; 
-  
   // A mapping that tells us, if we receive a message from a source,
   // to which pset that source belongs.
   std::map<unsigned, unsigned> itsRankToPsetIndex; 
 
-  AsyncCommunication* itsAsyncComm;
+  AsyncCommunication itsAsyncComm;
   const std::vector<unsigned> &itsInputPsets;
-  const  std::vector<unsigned> &itsOutputPsets;
+  const std::vector<unsigned> &itsOutputPsets;
   const LocationInfo &itsLocationInfo;
 
-  // Two maps that contain the handles to the asynchronous reads.
+  // The number of communicates (writes/reads) needed to transport one sub band.
+  static const unsigned itsNrCommunications = 3;
+
+  // The maps that contain the handles to the asynchronous reads.
   // The maps are indexed by the inputPset index.
   // The value is -1 if the read finished.
-  std::vector<int> dataHandles;
-  std::vector<int> metaDataHandles;
+  Matrix<int> itsCommHandles; // [itsNrCommunications][itsNrInputPsets]
 
   // The number of the transpose group we belong to.
   // The cores with the same index in a pset together form a group.
diff --git a/RTCP/CNProc/src/BeamFormerAsm.h b/RTCP/CNProc/src/BeamFormerAsm.h
index 4cc368fddf2..32db2a4164c 100644
--- a/RTCP/CNProc/src/BeamFormerAsm.h
+++ b/RTCP/CNProc/src/BeamFormerAsm.h
@@ -1,5 +1,7 @@
 #if defined HAVE_BGP
 
+#include <cstring>
+
 namespace LOFAR {
 namespace RTCP {
 
@@ -129,6 +131,33 @@ void _beamform_6beams_2times(
 
 } // extern "C"
 
+// Similar functions that do not need or have an ASM version
+
+// defined just to aid the use of macros
+static inline void _add_1_single_precision_vectors(
+  float *dst,
+  const float *src1,
+  unsigned count /* non-zero; multiple of 16 */
+) {
+  // nothing to add, so just copy the values
+  memcpy( dst, src1, count * sizeof(float) );
+}
+
+static inline void _add_7_single_precision_vectors(
+  float *dst,
+  const float *src1,
+  const float *src2,
+  const float *src3,
+  const float *src4,
+  const float *src5,
+  const float *src6,
+  const float *src7,
+  unsigned count /* non-zero; multiple of 16 */
+) {
+  _add_4_single_precision_vectors( dst, src1, src2, src3, src4, count );
+  _add_4_single_precision_vectors( dst, dst,  src5, src6, src7, count );
+}
+
 } // namespace LOFAR::RTCP
 } // namespace LOFAR
 
diff --git a/RTCP/CNProc/src/CN_Processing.cc b/RTCP/CNProc/src/CN_Processing.cc
index 7e427775736..3e5e52131a9 100644
--- a/RTCP/CNProc/src/CN_Processing.cc
+++ b/RTCP/CNProc/src/CN_Processing.cc
@@ -264,6 +264,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   itsIsTransposeOutput = outputPsetIndex != outputPsets.end();
 
   itsNrStations	                   = configuration.nrStations();
+  itsNrPencilBeams                 = configuration.nrPencilBeams();
   itsNrSubbands                    = configuration.nrSubbands();
   itsNrSubbandsPerPset             = configuration.nrSubbandsPerPset();
   itsMode                          = configuration.mode();
@@ -283,10 +284,6 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   const unsigned nrBeamFormedStations = itsBeamFormer->getNrBeamFormedStations();
   const unsigned nrBaselines = nrBeamFormedStations * (nrBeamFormedStations + 1) / 2;
 
-  // include both the pencil rings and the manually defined pencil beam coordinates
-  PencilRings pencilCoordinates( configuration.nrPencilRings(), configuration.pencilRingSize() );
-  pencilCoordinates += PencilCoordinates( configuration.manualPencilBeams() );
-
   // Each phase (e.g., transpose, PPF, correlator) reads from an input data
   // set and writes to an output data set.  To save memory, two memory buffers
   // are used, and consecutive phases alternately use one of them as input
@@ -295,14 +292,14 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   // Allocators for a single arena.
 
   if (itsIsTransposeInput) {
-    itsInputData = new InputData<SAMPLE_TYPE>(outputPsets.size(), nrSamplesToCNProc);
+    itsInputData = new InputData<SAMPLE_TYPE>(outputPsets.size(), nrSamplesToCNProc, itsNrPencilBeams);
   }
 
   if (itsIsTransposeOutput) {
     // create only the data structures that are used by the pipeline
 
-    itsTransposedData = new TransposedData<SAMPLE_TYPE>(itsNrStations, nrSamplesToCNProc);
-    itsFilteredData   = new FilteredData(itsNrStations, nrChannels, nrSamplesPerIntegration);
+    itsTransposedData = new TransposedData<SAMPLE_TYPE>(itsNrStations, nrSamplesToCNProc, itsNrPencilBeams);
+    itsFilteredData   = new FilteredData(itsNrStations, nrChannels, nrSamplesPerIntegration, itsNrPencilBeams);
 
     switch( itsMode.mode() ) {
       case CN_Mode::FILTER:
@@ -314,17 +311,17 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
         break;
 
       case CN_Mode::COHERENT_COMPLEX_VOLTAGES:
-        itsPencilBeamData = new PencilBeamData(pencilCoordinates.size(), nrChannels, nrSamplesPerIntegration);
+        itsPencilBeamData = new PencilBeamData(itsNrPencilBeams, nrChannels, nrSamplesPerIntegration);
         break;
 
       case CN_Mode::COHERENT_STOKES_I:
       case CN_Mode::COHERENT_ALLSTOKES:
-        itsPencilBeamData = new PencilBeamData(pencilCoordinates.size(), nrChannels, nrSamplesPerIntegration);
+        itsPencilBeamData = new PencilBeamData(itsNrPencilBeams, nrChannels, nrSamplesPerIntegration);
         // fallthrough
 
       case CN_Mode::INCOHERENT_STOKES_I:
       case CN_Mode::INCOHERENT_ALLSTOKES:
-        itsStokesData     = new StokesData(itsMode.isCoherent(), itsMode.nrStokes(), pencilCoordinates.size(), nrChannels, nrSamplesPerIntegration, nrSamplesPerStokesIntegration);
+        itsStokesData     = new StokesData(itsMode.isCoherent(), itsMode.nrStokes(), itsNrPencilBeams, nrChannels, nrSamplesPerIntegration, nrSamplesPerStokesIntegration);
         break;
 
       default:
@@ -337,7 +334,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
     }
 
     if( itsStokesIntegrateChannels ) {
-      itsStokesDataIntegratedChannels = new StokesDataIntegratedChannels(itsMode.isCoherent(), itsMode.nrStokes(), pencilCoordinates.size(), nrSamplesPerIntegration, nrSamplesPerStokesIntegration);
+      itsStokesDataIntegratedChannels = new StokesDataIntegratedChannels(itsMode.isCoherent(), itsMode.nrStokes(), itsNrPencilBeams, nrSamplesPerIntegration, nrSamplesPerStokesIntegration);
     }
   }
 
@@ -359,7 +356,6 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   // create the arenas and allocate the data sets
   itsMapping.allocate();
 
-  
   if (itsIsTransposeOutput) {
     const unsigned logicalNode	= usedCoresPerPset * (outputPsetIndex - outputPsets.begin()) + myCore;
     // TODO: logicalNode assumes output psets are consecutively numbered
@@ -376,7 +372,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
 
     itsPPF	      = new PPF<SAMPLE_TYPE>(itsNrStations, nrChannels, nrSamplesPerIntegration, configuration.sampleRate() / nrChannels, configuration.delayCompensation(), itsLocationInfo.rank() == 0);
 
-    itsPencilBeamFormer  = new PencilBeams(pencilCoordinates, itsNrStations, nrChannels, nrSamplesPerIntegration, itsCenterFrequencies[itsCurrentSubband], configuration.sampleRate() / nrChannels, configuration.refPhaseCentre(), configuration.phaseCentres() );
+    itsPencilBeamFormer  = new PencilBeams(itsNrPencilBeams, itsNrStations, nrChannels, nrSamplesPerIntegration, itsCenterFrequencies[itsCurrentSubband], configuration.sampleRate() / nrChannels );
     itsStokes            = new Stokes(itsMode.isCoherent(), itsMode.nrStokes(), nrChannels, nrSamplesPerIntegration, nrSamplesPerStokesIntegration );
     itsIncoherentStokesI = new Stokes(false, 1, nrChannels, nrSamplesPerIntegration, nrSamplesPerStokesIntegration );
 
@@ -388,7 +384,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   if (itsIsTransposeInput || itsIsTransposeOutput) {
     itsAsyncTranspose = new AsyncTranspose<SAMPLE_TYPE>(itsIsTransposeInput, itsIsTransposeOutput, 
 							myCore, itsLocationInfo, inputPsets, outputPsets, 
-							nrSamplesToCNProc, itsNrSubbands, itsNrSubbandsPerPset);
+							itsNrSubbands, itsNrSubbandsPerPset, itsNrPencilBeams);
   }
 #endif // HAVE_MPI
 }
@@ -529,7 +525,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::calculateCohere
     LOG_DEBUG(std::setprecision(12) << "core " << itsLocationInfo.rank() << ": start calculating coherent Stokes at " << MPI_Wtime());
 #endif // HAVE_MPI
   computeTimer.start();
-  itsStokes->calculateCoherent(itsPencilBeamData,itsStokesData,itsPencilBeamFormer->nrCoordinates());
+  itsStokes->calculateCoherent(itsPencilBeamData,itsStokesData,itsNrPencilBeams);
   computeTimer.stop();
 }
 
@@ -609,7 +605,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::process()
         formPencilBeams();
         calculateCoherentStokes();
 	if( itsStokesIntegrateChannels ) {
-	  itsStokes->compressStokes( itsStokesData, itsStokesDataIntegratedChannels, itsPencilBeamFormer->nrCoordinates() );
+	  itsStokes->compressStokes( itsStokesData, itsStokesDataIntegratedChannels, itsNrPencilBeams );
           sendOutput( itsStokesDataIntegratedChannels );
 	} else {
           sendOutput( itsStokesData );
diff --git a/RTCP/CNProc/src/CN_Processing.h b/RTCP/CNProc/src/CN_Processing.h
index d7f5487eeb4..b686e639ede 100644
--- a/RTCP/CNProc/src/CN_Processing.h
+++ b/RTCP/CNProc/src/CN_Processing.h
@@ -117,6 +117,7 @@ template <typename SAMPLE_TYPE> class CN_Processing : public CN_Processing_Base,
 #endif
 
     unsigned            itsNrStations;
+    unsigned            itsNrPencilBeams;
     unsigned            itsNrSubbands;
     unsigned            itsNrSubbandsPerPset;
     unsigned            itsComputeGroupRank;
diff --git a/RTCP/CNProc/src/CN_Processing_main.cc b/RTCP/CNProc/src/CN_Processing_main.cc
index ec84d9ea110..a01f4a5f650 100644
--- a/RTCP/CNProc/src/CN_Processing_main.cc
+++ b/RTCP/CNProc/src/CN_Processing_main.cc
@@ -48,7 +48,7 @@
 
 // if exceptions are not caught, an attempt is made to create a backtrace
 // from the place where the exception is thrown.
-#undef CATCH_EXCEPTIONS
+#define CATCH_EXCEPTIONS
 
 
 using namespace LOFAR;
diff --git a/RTCP/CNProc/src/InputData.h b/RTCP/CNProc/src/InputData.h
index 900886dae08..27e158178d2 100644
--- a/RTCP/CNProc/src/InputData.h
+++ b/RTCP/CNProc/src/InputData.h
@@ -23,7 +23,7 @@ template <typename SAMPLE_TYPE> class InputData: public SampleData<SAMPLE_TYPE,3
   public:
     typedef SampleData<SAMPLE_TYPE,3> SuperType;
 
-    InputData(const unsigned nrSubbands, const unsigned nrSamplesToCNProc);
+    InputData(const unsigned nrSubbands, const unsigned nrSamplesToCNProc, const unsigned nrPencilBeams);
 
     virtual void allocate( Allocator &allocator = heapAllocator );
 
@@ -36,31 +36,38 @@ template <typename SAMPLE_TYPE> class InputData: public SampleData<SAMPLE_TYPE,3
 
   private:
     const unsigned	    itsNrSubbands;
+    const unsigned	    itsNrPencilBeams;
 
   public:
     Vector<SubbandMetaData> metaData; //[outputPsets.size()]
 };
 
 
-template <typename SAMPLE_TYPE> inline InputData<SAMPLE_TYPE>::InputData(const unsigned nrSubbands, const unsigned nrSamplesToCNProc)
+template <typename SAMPLE_TYPE> inline InputData<SAMPLE_TYPE>::InputData(const unsigned nrSubbands, const unsigned nrSamplesToCNProc, const unsigned nrPencilBeams)
 :
   SuperType( false, boost::extents[nrSubbands][nrSamplesToCNProc][NR_POLARIZATIONS], 0 ),
-  itsNrSubbands(nrSubbands)
+  itsNrSubbands(nrSubbands),
+  itsNrPencilBeams(nrPencilBeams)
 {
 }
 
 template <typename SAMPLE_TYPE> inline void InputData<SAMPLE_TYPE>::allocate( Allocator &allocator )
 {
   SuperType::allocate( allocator );
-  metaData.resize( itsNrSubbands );
-}
+  metaData.resize( itsNrSubbands, 32 );
 
+  for( unsigned i = 0; i < itsNrSubbands; i++ ) {
+    metaData[i] = SubbandMetaData( itsNrPencilBeams );
+  }
+}
 
 // used for asynchronous transpose
 template <typename SAMPLE_TYPE> inline void InputData<SAMPLE_TYPE>::readMetaData(Stream *str)
 {
   // read all metadata
-  str->read(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+  for( unsigned subband = 0; subband < itsNrSubbands; subband++ ) {
+    metaData[subband].read( str );
+  }
 }
 
 // used for asynchronous transpose
@@ -76,7 +83,7 @@ template <typename SAMPLE_TYPE> inline void InputData<SAMPLE_TYPE>::readOne(Stre
 template <typename SAMPLE_TYPE> inline void InputData<SAMPLE_TYPE>::readAll(Stream *str)
 {
   // read all metadata
-  str->read(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+  readMetaData( str );
 
   // now read all subbands using one recvBlocking call, even though the ION
   // sends all subbands one at a time
diff --git a/RTCP/CNProc/src/PPF.cc b/RTCP/CNProc/src/PPF.cc
index eb6a135c153..d41f4cd6a44 100644
--- a/RTCP/CNProc/src/PPF.cc
+++ b/RTCP/CNProc/src/PPF.cc
@@ -262,7 +262,7 @@ template <typename SAMPLE_TYPE> void PPF<SAMPLE_TYPE>::filter(unsigned stat, dou
   _bgl_mutex_lock(mutex);
 #endif
 
-  unsigned alignmentShift = transposedData->metaData[stat].alignmentShift;
+  const unsigned alignmentShift = transposedData->metaData[stat].alignmentShift();
 
 #if 0
   LOG_DEBUG_STR(setprecision(15) << "stat " << stat << ", basefreq " << baseFrequency << ": delay from " << delays[stat].delayAtBegin << " to " << delays[stat].delayAfterEnd << " sec");
@@ -348,12 +348,11 @@ template <typename SAMPLE_TYPE> void PPF<SAMPLE_TYPE>::filter(unsigned stat, dou
 
   struct phase_shift phaseShifts[itsNrSamplesPerIntegration];
 
-  computePhaseShifts(phaseShifts, transposedData->metaData[stat].delayAtBegin, transposedData->metaData[stat].delayAfterEnd, baseFrequency);
+  computePhaseShifts(phaseShifts, transposedData->metaData[stat].beams[0].delayAtBegin, transposedData->metaData[stat].beams[0].delayAfterEnd, baseFrequency);
 
   // forward (pencil) beam forming information
-  for( unsigned i = 0; i < 3; i++ ) {
-    filteredData->metaData[stat].beamDirectionAtBegin[i] = transposedData->metaData[stat].beamDirectionAtBegin[i];
-    filteredData->metaData[stat].beamDirectionAfterEnd[i] = transposedData->metaData[stat].beamDirectionAfterEnd[i];
+  for( unsigned p = 0; p < filteredData->metaData[stat].beams.size(); p++ ) {
+    filteredData->metaData[stat].beams[p] = transposedData->metaData[stat].beams[p];
   }
 
   const SparseSet<unsigned>::Ranges &ranges = filteredData->flags[stat].getRanges();
diff --git a/RTCP/CNProc/src/PencilBeams.cc b/RTCP/CNProc/src/PencilBeams.cc
index 38e6be799ff..e2427185ac5 100644
--- a/RTCP/CNProc/src/PencilBeams.cc
+++ b/RTCP/CNProc/src/PencilBeams.cc
@@ -6,11 +6,14 @@
 #include <Interface/MultiDimArray.h>
 #include <Interface/PrintVector.h>
 #include <Interface/Exceptions.h>
+#include <Interface/SubbandMetaData.h>
 #include <Common/Timer.h>
+#include <Common/LofarLogger.h>
 #include <iostream>
 #include <cmath>
 #include <cassert>
 #include <vector>
+#include <cstring>
 
 #ifndef PENCILBEAMS_C_IMPLEMENTATION
   #include <BeamFormerAsm.h>
@@ -25,63 +28,17 @@ namespace RTCP {
 
 static NSTimer pencilBeamFormTimer("PencilBeamFormer::formBeams()", true, true);
 
-PencilBeams::PencilBeams(PencilCoordinates &coordinates, const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const double centerFrequency, const double channelBandwidth, const std::vector<double> &refPhaseCentre, const Matrix<double> &phaseCentres )
+PencilBeams::PencilBeams(const unsigned nrPencilBeams, const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const double centerFrequency, const double channelBandwidth )
 :
-  itsCoordinates(coordinates.getCoordinates()),
   itsNrStations(nrStations),
+  itsNrPencilBeams(nrPencilBeams),
   itsNrChannels(nrChannels),
   itsNrSamplesPerIntegration(nrSamplesPerIntegration),
   itsCenterFrequency(centerFrequency),
   itsChannelBandwidth(channelBandwidth),
   itsBaseFrequency(centerFrequency - (nrChannels/2) * channelBandwidth),
-  itsRefPhaseCentre(refPhaseCentre)
+  itsDelays( nrStations, nrPencilBeams )
 {
-  // copy all phase centres and their derived constants
-  itsPhaseCentres.reserve( nrStations );
-  itsBaselines.reserve( nrStations );
-  itsBaselinesSeconds.reserve( nrStations );
-  itsDelays.resize( nrStations, itsCoordinates.size() );
-  itsDelayOffsets.resize( nrStations, itsCoordinates.size() );
-  for( unsigned stat = 0; stat < nrStations; stat++ ) {
-     const double x = phaseCentres[stat][0];
-     const double y = phaseCentres[stat][1];
-     const double z = phaseCentres[stat][2];
-
-     PencilCoord3D phaseCentre( x, y, z );
-     PencilCoord3D baseLine = phaseCentre - refPhaseCentre;
-     PencilCoord3D baseLineSeconds = baseLine * (1.0/speedOfLight);
-
-     itsPhaseCentres.push_back( phaseCentre );
-     itsBaselines.push_back( baseLine );
-     itsBaselinesSeconds.push_back( baseLineSeconds );
-
-     for( unsigned beam = 0; beam < itsCoordinates.size(); beam++ ) {
-       itsDelayOffsets[stat][beam] = baseLine * itsCoordinates[beam] * (1.0/speedOfLight);
-     }
-  }
-}
-
-void PencilBeams::calculateDelays( unsigned stat, const PencilCoord3D &beamDir )
-{
-  const double compensatedDelay = itsDelayOffsets[stat][0] - itsBaselinesSeconds[stat] * beamDir;
-
-  //LOG_DEBUG_STR("station " << stat << " beam 0 has an absolute delay of  " << compensatedDelay);
-
-  // centre beam does not need compensation
-  itsDelays[stat][0] = 0.0;
-
-  for( unsigned i = 1; i < itsCoordinates.size(); i++ ) {
-     // delay[i] = baseline * (coordinate - beamdir) / c
-     //          = (baseline * coordinate / c) - (baseline * beamdir) / c
-     //          = delayoffset - baselinesec * beamdir
-     //
-     // further reduced by the delay we already compensate for when doing regular beam forming (the centre of the beam). that compensation is done at the IONode (sample shift) and the PPF (phase shift)
-     itsDelays[stat][i] = itsDelayOffsets[stat][i] - itsBaselinesSeconds[stat] * beamDir - compensatedDelay;
-
-     //LOG_DEBUG_STR("station " << stat << " beam " << i << "has an additional delay of " << itsDelays[stat][i]);
-     //LOG_DEBUG_STR(itsDelayOffsets[stat][i] << " " << itsBaselinesSeconds[stat] << " " << beamDir << " " << compensatedDelay);
-     //LOG_DEBUG_STR("example shift: " << phaseShift( itsBaseFrequency + itsNrChannels/2*itsChannelBandwidth, itsDelays[stat][i] ));
-  }
 }
 
 #ifdef PENCILBEAMS_C_IMPLEMENTATION
@@ -148,18 +105,20 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
 #else // ASM implementation
 
 // what we can process in one go
-static const unsigned NRSTATIONS = 3;
-static const unsigned NRBEAMS = 6;
+static const unsigned NRSTATIONS = 6;
+static const unsigned NRBEAMS = 3;
 #define BEAMFORMFUNC _beamform_up_to_6_stations_and_3_beams
+#define ADDFUNC(nr)  _add_ ## nr ## _single_precision_vectors
 
 // the number of samples to do in one go, such that the
 // caches are optimally used. itsNrSamplesPerIntegration needs
-// to be a multiple of this
+// to be a multiple of this.
+// Also, TIMESTEPSIZE needs to be a multiple of 16, as the assembly code requires it
 static const unsigned TIMESTEPSIZE = 128;
 
 void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData *out )
 {
-  const unsigned nrBeams = itsCoordinates.size();
+  ASSERT( TIMESTEPSIZE % 16 == 0 );
 
   if( itsNrSamplesPerIntegration % TIMESTEPSIZE > 0 ) {
     THROW(CNProcException, "nrSamplesPerIntegration (" << itsNrSamplesPerIntegration << ") needs to be a multiple of " << TIMESTEPSIZE );
@@ -187,7 +146,7 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
   }
 
   // conservative flagging: flag output if any input was flagged 
-  for( unsigned beam = 0; beam < nrBeams; beam++ ) {
+  for( unsigned beam = 0; beam < itsNrPencilBeams; beam++ ) {
     out->flags[beam].reset();
 
     for (unsigned stat = 0; stat < itsNrStations; stat++ ) {
@@ -205,23 +164,21 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
     const float factor = averagingFactor; // add multiplication factors as needed
 
     // construct the weights, with zeroes for unused data
-    fcomplex weights[itsNrStations][nrBeams] __attribute__ ((aligned(32)));
+    fcomplex weights[itsNrStations][itsNrPencilBeams] __attribute__ ((aligned(128)));
 
     for( unsigned s = 0; s < itsNrStations; s++ ) {
       if( validStation[s] ) {
-        for( unsigned b = 0; b < nrBeams; b++ ) {
+        for( unsigned b = 0; b < itsNrPencilBeams; b++ ) {
           weights[s][b] = phaseShift( frequency, itsDelays[s][b] ) * factor;
         }
       } else {
         // a dropped station has a weight of 0
-        for( unsigned b = 0; b < nrBeams; b++ ) {
+        for( unsigned b = 0; b < itsNrPencilBeams; b++ ) {
           weights[s][b] = makefcomplex( 0, 0 );
         }
       }
     }
 
-    // TODO: separate construction of central beam
-
     unsigned processBeams = NRBEAMS;
     unsigned processStations = NRSTATIONS;
 
@@ -233,104 +190,69 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
 
       for( unsigned time = 0; time < itsNrSamplesPerIntegration; time += TIMESTEPSIZE ) {
         // central beam (#0) has no weights, we can simply add the stations
+
 	switch( processStations ) {
 	  case 0:
 	  default:
 	    THROW(CNProcException,"Requested to add " << processStations << " stations, but can only add 1-6.");
 	    break;
 
+// possible inputs
+#define OUTPUT		(reinterpret_cast<float*>(out->samples[ch][0][time].origin()))
+#define STATION(nr)	(reinterpret_cast<const float*>(in->samples[ch][stat+nr][time].origin()))
+
+// shorthand for the add functions
+#define ADDGENERIC(nr,...)	ADDFUNC(nr)(						\
+	        OUTPUT,									\
+		__VA_ARGS__,								\
+	        TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */ )
+
+// adds stations, and the subtotal if needed (if stat!=0)
+#define ADD(nr,nrplusone,...)	do {							\
+				  if( stat ) {						\
+				    ADDGENERIC(nrplusone,OUTPUT,__VA_ARGS__);		\
+				  } else {						\
+				    ADDGENERIC(nr,__VA_ARGS__);				\
+				  }							\
+				} while(0);
+
 	  case 1:
-	    // nothing to add, so just copy the values
-	    memcpy( out->samples[ch][0][time].origin(), in->samples[ch][stat+0][time].origin(),
-              TIMESTEPSIZE * NR_POLARIZATIONS * sizeof out->samples[0][0][0][0] );
+            ADD( 1, 2, STATION(0) );
 	    break;
 
 	  case 2:
-	    _add_2_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 2, 3, STATION(0), STATION(1) );
 	    break;
 
 	  case 3:
-	    _add_3_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 3, 4, STATION(0), STATION(1), STATION(2) );
 	    break;
 
 	  case 4:
-	    _add_4_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+3][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 4, 5, STATION(0), STATION(1), STATION(2), STATION(3) );
 	    break;
 
 	  case 5:
-	    _add_4_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+3][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
-
-	    _add_2_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-	      reinterpret_cast<const float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+4][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 5, 6, STATION(0), STATION(1), STATION(2), STATION(3), STATION(4) );
 	    break;
-#if 0
-	  case 5:
-	    _add_5_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+3][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+4][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
-	    break;
-#endif
+
 	  case 6:
-	    _add_6_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+3][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+4][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+5][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 6, 7, STATION(0), STATION(1), STATION(2), STATION(3), STATION(4), STATION(5) );
 	    break;
 	}
 
 	// non-central beams
-        for( unsigned beam = 1; beam < nrBeams; beam += processBeams ) {
-          processBeams = std::min( NRBEAMS, nrBeams - beam ); 
+        for( unsigned beam = 1; beam < itsNrPencilBeams; beam += processBeams ) {
+          processBeams = std::min( NRBEAMS, itsNrPencilBeams - beam ); 
 
           // beam form
 	  // note: PPF.cc puts zeroes at flagged samples, so we can just add them
           BEAMFORMFUNC(
             out->samples[ch][beam][time].origin(),
-            out->samples[ch][beam].strides()[0] * sizeof out->samples[0][0][0][0],
+            out->samples[ch].strides()[0] * sizeof out->samples[0][0][0][0],
 
             in->samples[ch][stat][time].origin(),
-            in->samples[ch][stat].strides()[0] * sizeof in->samples[0][0][0][0],
+            in->samples[ch].strides()[0] * sizeof in->samples[0][0][0][0],
 
             &weights[stat][beam],
             (&weights[1][0] - &weights[0][0]) * sizeof weights[0][0],
@@ -350,16 +272,23 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
 
 #endif
 
-void PencilBeams::calculateAllDelays( const FilteredData *filteredData )
+void PencilBeams::calculateDelays( const FilteredData *filteredData )
 {
-  // calculate the delays for each station for this integration period
+  // Calculate the delays for each station for this integration period.
+
+  // We assume that the delay compensation has already occurred for the central beam. Also,
+  // we use the same delay for all time samples. This could be interpolated for TIMESTEPSIZE
+  // portions, as used in computeComplexVoltages.
+
   for( unsigned stat = 0; stat < itsNrStations; stat++ ) {
-    // todo: interpolate per time step?
-    PencilCoord3D beamDirBegin = filteredData->metaData[stat].beamDirectionAtBegin;
-    PencilCoord3D beamDirEnd = filteredData->metaData[stat].beamDirectionAfterEnd;
-    PencilCoord3D beamDirAverage = (beamDirBegin + beamDirEnd) * 0.5;
+    for( unsigned pencil = 0; pencil < itsNrPencilBeams; pencil++ ) {
+      const SubbandMetaData::beamInfo &beamInfo = filteredData->metaData[stat].beams[pencil];
 
-    calculateDelays( stat, beamDirAverage );
+      itsDelays[stat][pencil] = (beamInfo.delayAfterEnd - beamInfo.delayAtBegin) * 0.5;
+
+      // subtract the delay that was already compensated for (i.e. the central beam)
+      itsDelays[stat][pencil] -= itsDelays[stat][0];
+    }
   }
 }
 
@@ -370,7 +299,7 @@ void PencilBeams::formPencilBeams( const FilteredData *filteredData, PencilBeamD
 
   pencilBeamFormTimer.start();
 
-  calculateAllDelays( filteredData );
+  calculateDelays( filteredData );
   computeComplexVoltages( filteredData, pencilBeamData );
 
   pencilBeamFormTimer.stop();
diff --git a/RTCP/CNProc/src/PencilBeams.h b/RTCP/CNProc/src/PencilBeams.h
index d6ef72bc009..fd7eed8dcfa 100644
--- a/RTCP/CNProc/src/PencilBeams.h
+++ b/RTCP/CNProc/src/PencilBeams.h
@@ -16,40 +16,33 @@
 namespace LOFAR {
 namespace RTCP {
 
+const double speedOfLight = 299792458;
 
 class PencilBeams
 {
   public:
-    static const double speedOfLight = 299792458;
     static const float MAX_FLAGGED_PERCENTAGE = 0.9f;
 
-    PencilBeams(PencilCoordinates &coordinates, const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const double centerFrequency, const double channelBandwidth, const std::vector<double> &refPhaseCentre, const Matrix<double> &phaseCentres );
+    PencilBeams(const unsigned nrPencilBeams, const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const double centerFrequency, const double channelBandwidth );
 
     void formPencilBeams( const FilteredData *filteredData, PencilBeamData *pencilBeamData );
 
-    size_t nrCoordinates() const { return itsCoordinates.size(); }
-
   private:
+    void calculateDelays( const FilteredData *filteredData );
+
     fcomplex phaseShift( const float frequency, const float delay ) const;
     void computeBeams( const MultiDimArray<fcomplex,4> &in, MultiDimArray<fcomplex,4> &out, const std::vector<unsigned> stations );
-    void calculateDelays( const unsigned stat, const PencilCoord3D &beamDir );
-    void calculateAllDelays( const FilteredData *filteredData );
 
     void computeComplexVoltages( const FilteredData *filteredData, PencilBeamData *pencilBeamData );
 
-    std::vector<PencilCoord3D> itsCoordinates;
     const unsigned          itsNrStations;
+    const unsigned          itsNrPencilBeams;
     const unsigned          itsNrChannels;
     const unsigned          itsNrSamplesPerIntegration;
     const double            itsCenterFrequency;
     const double            itsChannelBandwidth;
     const double            itsBaseFrequency;
-    Matrix<double>          itsDelays; // [itsNrStations][itsCoordinates.size()]
-    Matrix<double>          itsDelayOffsets; // [itsNrStations][itsCoordinates.size()]
-    PencilCoord3D           itsRefPhaseCentre;
-    std::vector<PencilCoord3D> itsPhaseCentres;
-    std::vector<PencilCoord3D> itsBaselines;
-    std::vector<PencilCoord3D> itsBaselinesSeconds;
+    Matrix<double>          itsDelays; // [itsNrStations][itsNrPencilBeams]
 };
 
 inline fcomplex PencilBeams::phaseShift( const float frequency, const float delay ) const
diff --git a/RTCP/CNProc/src/TransposedData.h b/RTCP/CNProc/src/TransposedData.h
index 8aad180da8f..dccea444c42 100644
--- a/RTCP/CNProc/src/TransposedData.h
+++ b/RTCP/CNProc/src/TransposedData.h
@@ -3,6 +3,7 @@
 
 #include <Common/lofar_complex.h>
 #include <Interface/Align.h>
+#include <Common/LofarLogger.h>
 #include <Interface/Config.h>
 #include <Interface/MultiDimArray.h>
 #include <Interface/StreamableData.h>
@@ -19,22 +20,23 @@ template <typename SAMPLE_TYPE> class TransposedData: public SampleData<SAMPLE_T
   public:
     typedef SampleData<SAMPLE_TYPE,3> SuperType;
 
-    TransposedData(const unsigned nrStations, const unsigned nrSamplesToCNProc);
+    TransposedData(const unsigned nrStations, const unsigned nrSamplesToCNProc, const unsigned nrPencilBeams);
 
-    virtual size_t requiredSize() const;
     virtual void allocate( Allocator &allocator = heapAllocator );
 
     Vector<SubbandMetaData> metaData; //[itsNrStations]
   private:
     const unsigned		itsNrStations;
+    const unsigned		itsNrPencilBeams;
     const unsigned		itsNrSamplesToCNProc;
 };
 
 
-template <typename SAMPLE_TYPE> inline TransposedData<SAMPLE_TYPE>::TransposedData(const unsigned nrStations, const unsigned nrSamplesToCNProc)
+template <typename SAMPLE_TYPE> inline TransposedData<SAMPLE_TYPE>::TransposedData(const unsigned nrStations, const unsigned nrSamplesToCNProc, const unsigned nrPencilBeams)
 :
   SuperType(false,boost::extents[nrStations][nrSamplesToCNProc][NR_POLARIZATIONS],0),
   itsNrStations(nrStations),
+  itsNrPencilBeams(nrPencilBeams),
   itsNrSamplesToCNProc(nrSamplesToCNProc)
 {
 }
@@ -42,13 +44,11 @@ template <typename SAMPLE_TYPE> inline TransposedData<SAMPLE_TYPE>::TransposedDa
 template <typename SAMPLE_TYPE> inline void TransposedData<SAMPLE_TYPE>::allocate( Allocator &allocator )
 {
   SuperType::allocate( allocator );
-  metaData.resize(itsNrStations, 32, allocator);
-}
+  metaData.resize(itsNrStations, 32);
 
-template <typename SAMPLE_TYPE> inline size_t TransposedData<SAMPLE_TYPE>::requiredSize() const
-{
-  return SuperType::requiredSize() +
-	 align(sizeof(SubbandMetaData) * itsNrStations, 32);
+  for( unsigned i = 0; i < itsNrStations; i++ ) {
+    metaData[i] = SubbandMetaData( itsNrPencilBeams );
+  }
 }
 
 } // namespace RTCP
diff --git a/RTCP/IONProc/src/InputSection.cc b/RTCP/IONProc/src/InputSection.cc
index 22730641177..d6df2142d55 100644
--- a/RTCP/IONProc/src/InputSection.cc
+++ b/RTCP/IONProc/src/InputSection.cc
@@ -134,6 +134,7 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::preprocess(const
   itsNSubbandsPerPset	= ps->nrSubbandsPerPset();
   itsNSamplesPerSec	= ps->nrSubbandSamples();
   itsNrBeams		= ps->nrBeams();
+  itsNrPencilBeams	= ps->nrPencilBeams();
   itsNrPsets		= ps->nrPsets();
 
 #if defined DUMP_RAW_DATA
@@ -166,10 +167,10 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::preprocess(const
   itsSyncedStamp = TimeStamp(seconds, samples);
  
   if (itsNeedDelays) {
-    itsDelaysAtBegin.resize(itsNrBeams);
-    itsDelaysAfterEnd.resize(itsNrBeams);
-    itsBeamDirectionsAtBegin.resize(itsNrBeams);
-    itsBeamDirectionsAfterEnd.resize(itsNrBeams);
+    itsDelaysAtBegin.resize(itsNrBeams,itsNrPencilBeams);
+    itsDelaysAfterEnd.resize(itsNrBeams,itsNrPencilBeams);
+    itsBeamDirectionsAtBegin.resize(itsNrBeams,itsNrPencilBeams);
+    itsBeamDirectionsAfterEnd.resize(itsNrBeams,itsNrPencilBeams);
     
     itsDelayComp = new WH_DelayCompensation(ps, inputs[0].station, itsSyncedStamp); // TODO: support more than one station
 
@@ -186,8 +187,8 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::preprocess(const
   itsBBuffers.resize(itsNrInputs);
   itsDelayedStamps.resize(itsNrBeams);
   itsSamplesDelay.resize(itsNrBeams);
-  itsFineDelaysAtBegin.resize(itsNrBeams);
-  itsFineDelaysAfterEnd.resize(itsNrBeams);
+  itsFineDelaysAtBegin.resize(itsNrBeams,itsNrPencilBeams);
+  itsFineDelaysAfterEnd.resize(itsNrBeams,itsNrPencilBeams);
   itsFlags.resize(boost::extents[itsNrInputs][itsNrBeams]);
 
   for (unsigned rsp = 0; rsp < itsNrInputs; rsp ++)
@@ -236,24 +237,33 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::computeDelays()
     for (unsigned beam = 0; beam < itsNrBeams; beam ++) {
       // The coarse delay is determined for the center of the current
       // time interval and is expressed in an entire amount of samples.
-      const signed int coarseDelay = static_cast<signed int>(floor(0.5 * (itsDelaysAtBegin[beam] + itsDelaysAfterEnd[beam]) * itsSampleRate + 0.5));
-    
+      //
+      // We use the central pencil beam (#0) for the coarse delay compensation.
+      const signed int coarseDelay = static_cast<signed int>(floor(0.5 * (itsDelaysAtBegin[beam][0] + itsDelaysAfterEnd[beam][0]) * itsSampleRate + 0.5));
+
       // The fine delay is determined for the boundaries of the current
       // time interval and is expressed in seconds.
       const double d = coarseDelay * itsSampleDuration;
-    
+
       itsDelayedStamps[beam]      -= coarseDelay;
       itsSamplesDelay[beam]       = -coarseDelay;
-      itsFineDelaysAtBegin[beam]  = static_cast<float>(itsDelaysAtBegin[beam] - d);
-      itsFineDelaysAfterEnd[beam] = static_cast<float>(itsDelaysAfterEnd[beam] - d);
+
+      for (unsigned pencil = 0; pencil < itsNrPencilBeams; pencil++ ) {
+        // we don't do coarse delay compensation for the individual pencil beams to avoid complexity and overhead
+        itsFineDelaysAtBegin[beam][pencil]  = static_cast<float>(itsDelaysAtBegin[beam][pencil] - d);
+        itsFineDelaysAfterEnd[beam][pencil] = static_cast<float>(itsDelaysAfterEnd[beam][pencil] - d);
+      }
     }
 
     itsDelayTimer.stop();
   } else {
     for (unsigned beam = 0; beam < itsNrBeams; beam ++) {
       itsSamplesDelay[beam]       = 0;
-      itsFineDelaysAtBegin[beam]  = 0;
-      itsFineDelaysAfterEnd[beam] = 0;
+
+      for (unsigned pencil = 0; pencil < itsNrPencilBeams; pencil++ ) {
+        itsFineDelaysAtBegin[beam][pencil]  = 0;
+        itsFineDelaysAfterEnd[beam][pencil] = 0;
+      }
     }  
   }
 }
@@ -291,7 +301,7 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::writeLogMessage()
 
   if (itsDelayCompensation) {
     for (unsigned beam = 0; beam < itsNrBeams; beam ++)
-      logStr << (beam == 0 ? ", delays: [" : ", ") << PrettyTime(itsDelaysAtBegin[beam], 7);
+      logStr << (beam == 0 ? ", delays: [" : ", ") << PrettyTime(itsDelaysAtBegin[beam][0], 7);
       //logStr << (beam == 0 ? ", delays: [" : ", ") << PrettyTime(itsDelaysAtBegin[beam], 7) << " = " << itsSamplesDelay[beam] << " samples + " << PrettyTime(itsFineDelaysAtBegin[beam], 7);
 
     logStr << "]";
@@ -319,32 +329,43 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::toComputeNodes()
     command.write(stream);
 
     // create and send all metadata in one "large" message
-    std::vector<SubbandMetaData, AlignedStdAllocator<SubbandMetaData, 16> > metaData(itsNrPsets);
-
+    Vector<SubbandMetaData> metaData( itsNrPsets, 16, heapAllocator );
+    for( unsigned i = 0; i < itsNrPsets; i++ ) {
+      metaData[i] = SubbandMetaData( itsNrPencilBeams );
+    }
+    
     if( itsNeedDelays ) {
       for (unsigned pset = 0; pset < itsNrPsets; pset ++) {
         unsigned subband  = itsNSubbandsPerPset * pset + subbandBase;
+
 	if(subband < itsNSubbands) {
 	  unsigned rspBoard = itsSubbandToRSPboardMapping[subband];
 	  unsigned beam     = itsSubbandToBeamMapping[subband];
-	  const vector<double> &beamDirBegin = itsBeamDirectionsAtBegin[beam].coord().get();
-	  const vector<double> &beamDirEnd   = itsBeamDirectionsAfterEnd[beam].coord().get();
 
-	  metaData[pset].delayAtBegin   = itsFineDelaysAtBegin[beam];
-	  metaData[pset].delayAfterEnd  = itsFineDelaysAfterEnd[beam];
+          for( unsigned p = 0; p < itsNrPencilBeams; p++ ) {
+            struct SubbandMetaData::beamInfo &beamInfo = metaData[pset].beams[p];
 
-	  for( unsigned i = 0; i < 3; i++ ) {
-	    metaData[pset].beamDirectionAtBegin[i]  = beamDirBegin[i];
-	    metaData[pset].beamDirectionAfterEnd[i] = beamDirEnd[i];
-	  }
+	    beamInfo.delayAtBegin   = itsFineDelaysAtBegin[beam][p];
+	    beamInfo.delayAfterEnd  = itsFineDelaysAfterEnd[beam][p];
 
-	  metaData[pset].alignmentShift = itsBBuffers[rspBoard]->alignmentShift(beam);
+	    const vector<double> &beamDirBegin = itsBeamDirectionsAtBegin[beam][p].coord().get();
+	    const vector<double> &beamDirEnd   = itsBeamDirectionsAfterEnd[beam][p].coord().get();
+
+	    for( unsigned i = 0; i < 3; i++ ) {
+	      beamInfo.beamDirectionAtBegin[i]  = beamDirBegin[i];
+	      beamInfo.beamDirectionAfterEnd[i] = beamDirEnd[i];
+	    }
+	  }  
+
+	  metaData[pset].alignmentShift() = itsBBuffers[rspBoard]->alignmentShift(beam);
 	  metaData[pset].setFlags(itsFlags[rspBoard][beam]);
 	}
       }
     }
 
-    stream->write(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+    for( unsigned pset = 0; pset < itsNrPsets; pset++ ) {
+      metaData[pset].write( stream );
+    }
 
     // now send all subband data
     for (unsigned pset = 0; pset < itsNrPsets; pset ++) {
diff --git a/RTCP/IONProc/src/InputSection.h b/RTCP/IONProc/src/InputSection.h
index f13acdc1003..f6e6ab1f45d 100644
--- a/RTCP/IONProc/src/InputSection.h
+++ b/RTCP/IONProc/src/InputSection.h
@@ -30,6 +30,7 @@
 
 //# Includes
 #include <Interface/Parset.h>
+#include <Interface/MultiDimArray.h>
 #include <Interface/RSPTimeStamp.h>
 #include <Stream/Stream.h>
 #include <BeamletBuffer.h>
@@ -85,10 +86,10 @@ template <typename SAMPLE_TYPE> class InputSection {
     
     TimeStamp			 itsSyncedStamp;
    
-    std::vector<double>		 itsDelaysAtBegin;
-    std::vector<double>		 itsDelaysAfterEnd;
-    std::vector<AMC::Direction>	 itsBeamDirectionsAtBegin;
-    std::vector<AMC::Direction>	 itsBeamDirectionsAfterEnd;
+    Matrix<double>		 itsDelaysAtBegin;
+    Matrix<double>		 itsDelaysAfterEnd;
+    Matrix<AMC::Direction>	 itsBeamDirectionsAtBegin;
+    Matrix<AMC::Direction>	 itsBeamDirectionsAfterEnd;
     unsigned			 itsNrPsets;
     
     unsigned			 itsMaxNetworkDelay; // in samples
@@ -98,6 +99,7 @@ template <typename SAMPLE_TYPE> class InputSection {
     unsigned			 itsNHistorySamples;
     unsigned			 itsNrInputs;
     unsigned			 itsNrBeams;
+    unsigned			 itsNrPencilBeams;
 
     unsigned			 itsCurrentComputeCore, itsNrCoresPerPset;
     unsigned			 itsPsetNumber;
@@ -108,9 +110,10 @@ template <typename SAMPLE_TYPE> class InputSection {
 
     std::vector<TimeStamp>	 itsDelayedStamps;
     std::vector<signed int>	 itsSamplesDelay;
-    std::vector<float>		 itsFineDelaysAtBegin, itsFineDelaysAfterEnd;
     boost::multi_array<SparseSet<unsigned>, 2> itsFlags;
 
+    Matrix<float>		 itsFineDelaysAtBegin, itsFineDelaysAfterEnd;
+
     
     LogThread			 *itsLogThread;
     NSTimer			 itsDelayTimer;
diff --git a/RTCP/IONProc/src/WH_DelayCompensation.cc b/RTCP/IONProc/src/WH_DelayCompensation.cc
index 99f52c1cd87..0f37380cf4d 100644
--- a/RTCP/IONProc/src/WH_DelayCompensation.cc
+++ b/RTCP/IONProc/src/WH_DelayCompensation.cc
@@ -36,6 +36,7 @@
 #include <Common/LofarLogger.h>
 #include <Common/PrettyUnits.h>
 #include <Interface/Exceptions.h>
+#include <Interface/PencilCoordinates.h>
 
 namespace LOFAR 
 {
@@ -49,15 +50,16 @@ namespace LOFAR
                                                const string &stationName,
 					       const TimeStamp &startTime) :
       stop         (false),					       
-      itsBuffer    (bufferSize,ps->nrBeams()),
+      itsBuffer    (bufferSize,ps->nrBeams() * ps->nrPencilBeams()),
       head         (0),
       tail         (0),
       bufferFree   (bufferSize),
       bufferUsed   (0),
 
       itsNrCalcDelays    (ps->nrCalcDelays()),
-      itsNrBeams   (ps->nrBeams()),
-      itsStartTime (startTime),
+      itsNrBeams         (ps->nrBeams()),
+      itsNrPencilBeams   (ps->nrPencilBeams()),
+      itsStartTime       (startTime),
       itsNrSamplesPerSec (ps->nrSubbandSamples()),
       itsSampleDuration  (ps->sampleDuration()),
       itsStationName     (stationName),
@@ -150,14 +152,14 @@ namespace LOFAR
         ResultData result;
         converter->j2000ToItrf(result, request); // expensive
 
-        ASSERTSTR(result.direction.size() == itsNrCalcDelays * itsNrBeams,
-	  	  result.direction.size() << " == " << itsNrCalcDelays * itsNrBeams);
+        ASSERTSTR(result.direction.size() == itsNrCalcDelays * itsNrBeams * itsNrPencilBeams,
+	  	  result.direction.size() << " == " << itsNrCalcDelays * itsNrBeams * itsNrPencilBeams );
 
 	// append the results to the circular buffer
 	unsigned resultIndex = 0;
 
         for( unsigned t = 0; t < itsNrCalcDelays; t++ ) {
-	  for( unsigned b = 0; b < itsNrBeams; b++ ) {
+	  for( unsigned b = 0; b < itsNrBeams * itsNrPencilBeams; b++ ) {
 	    ASSERTSTR( tail < bufferSize, tail << " < " << bufferSize );
 
 	    itsBuffer[tail][b] = result.direction[resultIndex++];
@@ -181,23 +183,25 @@ namespace LOFAR
       delete converter;
     }
 
-    void WH_DelayCompensation::getNextDelays( vector<Direction> &directions, vector<double> &delays )
+    void WH_DelayCompensation::getNextDelays( Matrix<Direction> &directions, Matrix<double> &delays )
     {
-      ASSERTSTR(directions.size() == itsNrBeams,
-                directions.size() << " == " << itsNrBeams);
+      ASSERTSTR(directions.num_elements() == itsNrBeams * itsNrPencilBeams,
+                directions.num_elements() << " == " << itsNrBeams << "*" << itsNrPencilBeams);
 
-      ASSERTSTR(delays.size() == itsNrBeams,
-                delays.size() << " == " << itsNrBeams);
+      ASSERTSTR(delays.num_elements() == itsNrBeams * itsNrPencilBeams,
+                delays.num_elements() << " == " << itsNrBeams << "*" << itsNrPencilBeams);
 
       bufferUsed.down();
 
       // copy the directions at itsBuffer[head] into the provided buffer,
       // and calculate the respective delays
       for( unsigned b = 0; b < itsNrBeams; b++ ) {
-        const Direction &dir = itsBuffer[head][b];
+        for( unsigned p = 0; p < itsNrPencilBeams; p++ ) {
+          const Direction &dir = itsBuffer[head][b*itsNrPencilBeams + p];
 
-        directions[b] = dir;
-	delays[b] = dir * itsPhasePositionDiffs * (1.0 / speedOfLight);
+          directions[b][p] = dir;
+	  delays[b][p] = dir * itsPhasePositionDiffs * (1.0 / speedOfLight);
+	}  
       }
 
       // increment the head pointer
@@ -212,11 +216,18 @@ namespace LOFAR
 
     void WH_DelayCompensation::setBeamDirections(const Parset *ps)
     {
+      const PencilCoordinates& pencilBeams = ps->pencilBeams();
+
       // What coordinate system is used for these source directions?
       // Currently, we support J2000, ITRF, and AZEL.
       Direction::Types dirType(Direction::INVALID);
+
+      // TODO: For now, we include pencil beams for all regular beams,
+      // and use the pencil beam offsets as offsets in J2000.
+      // To do the coordinates properly, the offsets should be applied
+      // in today's coordinates (JMEAN/JTRUE?), not J2000.
       
-      itsBeamDirections.resize(itsNrBeams);
+      itsBeamDirections.resize(itsNrBeams*itsNrPencilBeams);
       
       // Get the source directions from the parameter set. 
       // Split the \a dir vector into separate Direction objects.
@@ -230,7 +241,17 @@ namespace LOFAR
         else THROW(IONProcException, "Observation.BeamDirectionType must be one of "
                        "J2000, ITRF, or AZEL");
 
-        itsBeamDirections[beam] = Direction(beamDir[0], beamDir[1], dirType);
+        for (unsigned pencil = 0; pencil < itsNrPencilBeams; pencil ++) {
+	  // obtain pencil coordinate
+	  const PencilCoord3D &pencilCoord = pencilBeams[pencil];
+
+	  // apply angle modification (TODO: different calculates for J2000 (ra/dec) and AZEL and ITRF!)
+	  const double angle1 = beamDir[0] + pencilCoord[0];
+	  const double angle2 = beamDir[1] + pencilCoord[1];
+
+	  // store beam
+          itsBeamDirections[beam*itsNrBeams + pencil] = Direction(angle1, angle2, dirType);
+	}
       }
     }
 
diff --git a/RTCP/IONProc/src/WH_DelayCompensation.h b/RTCP/IONProc/src/WH_DelayCompensation.h
index cfa11fb25e7..6427e7606e6 100644
--- a/RTCP/IONProc/src/WH_DelayCompensation.h
+++ b/RTCP/IONProc/src/WH_DelayCompensation.h
@@ -95,7 +95,8 @@ namespace LOFAR
       ~WH_DelayCompensation();
 
       // get the set of directions and delays for the beams, for the next CN integration time
-      void getNextDelays( vector<AMC::Direction> &directions, vector<double> &delays );
+      // Both matrices must have dimensions [itsNrBeams][itsNrPencilBeams]
+      void getNextDelays( Matrix<AMC::Direction> &directions, Matrix<double> &delays );
       
     private:
       // do the delay compensation calculations in a separate thread to allow bulk
@@ -137,6 +138,7 @@ namespace LOFAR
 
       // Beam info.
       const unsigned                itsNrBeams;
+      const unsigned                itsNrPencilBeams;
       vector<AMC::Direction>        itsBeamDirections;
 
       // Sample timings.
diff --git a/RTCP/Interface/include/Interface/CN_Configuration.h b/RTCP/Interface/include/Interface/CN_Configuration.h
index 5d1337ac777..646fff031d3 100644
--- a/RTCP/Interface/include/Interface/CN_Configuration.h
+++ b/RTCP/Interface/include/Interface/CN_Configuration.h
@@ -26,6 +26,7 @@
 #include <Stream/Stream.h>
 #include <Interface/MultiDimArray.h>
 #include <Interface/CN_Mode.h>
+#include <Interface/PencilCoordinates.h>
 #include <Interface/Parset.h>
 
 #include <vector>
@@ -57,17 +58,13 @@ class CN_Configuration
     double		  &sampleRate();
     std::vector<unsigned> &inputPsets(), &outputPsets(), &tabList();
     std::vector<double>	  &refFreqs();
-    unsigned              &nrPencilRings();
-    double                &pencilRingSize();
-    unsigned              &nrManualPencilBeams();
-    Matrix<double>        &manualPencilBeams();
+    PencilCoordinates     &pencilBeams();
     std::vector<double>   &refPhaseCentre();
     Matrix<double>        &phaseCentres();
     CN_Mode               &mode();
     bool                  &outputIncoherentStokesI();
     bool                  &stokesIntegrateChannels();
-
-    unsigned              nrPencilBeams() { return 3 * nrPencilRings() * (nrPencilRings() + 1) + 1 + nrManualPencilBeams(); }
+    unsigned              nrPencilBeams() const;
     
     void		  read(Stream *);
     void		  write(Stream *);
@@ -81,7 +78,7 @@ class CN_Configuration
     std::vector<unsigned> itsInputPsets, itsOutputPsets, itsTabList;
     std::vector<double>	  itsRefFreqs;
     std::vector<double>	  itsRefPhaseCentre;
-    Matrix<double>        itsManualPencilBeams;
+    PencilCoordinates     itsPencilBeams;
     Matrix<double>        itsPhaseCentres;
     CN_Mode               itsMode;
 
@@ -103,12 +100,10 @@ class CN_Configuration
       unsigned		  itsRefFreqsSize;
       unsigned		  itsInputPsets[MAX_PSETS], itsOutputPsets[MAX_PSETS], itsTabList[MAX_PSETS];
       double		  itsRefFreqs[MAX_SUBBANDS];
-      unsigned            itsNrPencilRings;
-      double              itsPencilRingSize;
       double              itsRefPhaseCentre[3];
       double              itsPhaseCentres[MAX_STATIONS * 3];
-      unsigned            itsNrManualPencilBeams;
-      double              itsManualPencilBeams[MAX_PENCILBEAMS * 2];
+      unsigned            itsNrPencilBeams;
+      double              itsPencilBeams[MAX_PENCILBEAMS * 2];
       bool                itsOutputIncoherentStokesI;
       bool                itsStokesIntegrateChannels;
     } itsMarshalledData;
@@ -195,24 +190,14 @@ inline std::vector<double> & CN_Configuration::refFreqs()
   return itsRefFreqs;
 }
 
-inline unsigned &CN_Configuration::nrPencilRings()
-{
-  return itsMarshalledData.itsNrPencilRings;
-}
-
-inline double &CN_Configuration::pencilRingSize()
-{
-  return itsMarshalledData.itsPencilRingSize;
-}
-
-inline unsigned &CN_Configuration::nrManualPencilBeams()
+inline PencilCoordinates &CN_Configuration::pencilBeams()
 {
-  return itsMarshalledData.itsNrManualPencilBeams;
+  return itsPencilBeams;
 }
 
-inline Matrix<double> &CN_Configuration::manualPencilBeams()
+inline unsigned CN_Configuration::nrPencilBeams() const
 {
-  return itsManualPencilBeams;
+  return itsPencilBeams.size();
 }
 
 inline std::vector<double> &CN_Configuration::refPhaseCentre()
diff --git a/RTCP/Interface/include/Interface/FilteredData.h b/RTCP/Interface/include/Interface/FilteredData.h
index 0975a50230a..b33e983070d 100644
--- a/RTCP/Interface/include/Interface/FilteredData.h
+++ b/RTCP/Interface/include/Interface/FilteredData.h
@@ -18,15 +18,15 @@ class FilteredData: public SampleData<fcomplex,4>
   public:
     typedef SampleData<fcomplex,4> SuperType;
 
-    FilteredData(const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration);
+    FilteredData(const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const unsigned nrStations);
 
-    virtual size_t requiredSize() const;
     virtual void allocate( Allocator &allocator = heapAllocator );
 
     Vector<SubbandMetaData>     metaData; //[itsNrStations]
 
   protected:
     const unsigned              itsNrStations;
+    const unsigned              itsNrPencilBeams;
     const unsigned              itsNrChannels;
     const unsigned              itsNrSamplesPerIntegration;
     virtual void readData( Stream* );
@@ -35,41 +35,45 @@ class FilteredData: public SampleData<fcomplex,4>
 
 
 
-inline FilteredData::FilteredData(const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration)
+inline FilteredData::FilteredData(const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const unsigned nrPencilBeams)
 :
   // The "| 2" significantly improves transpose speeds for particular
   // numbers of stations due to cache conflict effects.  The extra memory
   // is not used.
   SuperType::SampleData(false,boost::extents[nrChannels][nrStations][nrSamplesPerIntegration | 2][NR_POLARIZATIONS], nrStations),
+  metaData(nrStations,32),
   itsNrStations(nrStations),
+  itsNrPencilBeams(nrPencilBeams),
   itsNrChannels(nrChannels),
   itsNrSamplesPerIntegration(nrSamplesPerIntegration)
 {
 }
 
-inline size_t FilteredData::requiredSize() const
-{
-  return SuperType::requiredSize() +
-         align(sizeof(SubbandMetaData) * itsNrStations, 32);
-}
-
 inline void FilteredData::allocate( Allocator &allocator )
 {
   SuperType::allocate( allocator );
-  metaData.resize( itsNrStations, 32, allocator );
+  metaData.resize( itsNrStations, 32 );
+  
+  for( unsigned i = 0; i < itsNrStations; i++ ) {
+    metaData[i] = SubbandMetaData( itsNrPencilBeams );
+  }
 }
 
 
 inline void FilteredData::readData(Stream *str)
 {
-  str->read(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+  for( unsigned subband = 0; subband < itsNrStations; subband++ ) {
+    metaData[subband].read( str );
+  }
   SuperType::readData(str);
 }
 
 
 inline void FilteredData::writeData(Stream *str)
 {
-  str->write(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+  for( unsigned subband = 0; subband < itsNrStations; subband++ ) {
+    metaData[subband].write( str );
+  }
   SuperType::writeData(str);
 }
 
diff --git a/RTCP/Interface/include/Interface/Makefile.am b/RTCP/Interface/include/Interface/Makefile.am
index 7ef160e42f7..b1b02a8d468 100644
--- a/RTCP/Interface/include/Interface/Makefile.am
+++ b/RTCP/Interface/include/Interface/Makefile.am
@@ -21,6 +21,7 @@ pkginclude_HEADERS = Package__Version.h \
 		     PencilBeamData.h \
 		     PencilCoordinates.h \
                      StokesData.h \
-                     PipelineOutput.h
+                     PipelineOutput.h \
+                     PencilCoordinates.h
 
 include $(top_srcdir)/Makefile.common
diff --git a/RTCP/Interface/include/Interface/Parset.h b/RTCP/Interface/include/Interface/Parset.h
index d3f4fdd5ef2..26cbf6c622e 100644
--- a/RTCP/Interface/include/Interface/Parset.h
+++ b/RTCP/Interface/include/Interface/Parset.h
@@ -36,6 +36,7 @@
 #include <Common/lofar_datetime.h>
 #include <Common/LofarLogger.h> 
 #include <Interface/Config.h>
+#include <Interface/PencilCoordinates.h>
 #include <ApplCommon/Observation.h>
 #include <Stream/Stream.h>
 
@@ -110,12 +111,8 @@ public:
 	bool           outputIncoherentStokesI() const;
 	bool           stokesIntegrateChannels() const;
 
-	uint32         nrManualPencilBeams() const;
-	vector<double> getManualPencilBeam( const unsigned pencil ) const;
-	uint32	       nrPencilRings() const;
-	double	       pencilRingSize() const;
-
 	uint32	       nrPencilBeams() const;
+	PencilCoordinates pencilBeams() const;
 
 	unsigned         nrSubbands() const;
 	unsigned         nrBeams() const;
@@ -150,6 +147,11 @@ public:
 	vector<double> itsStPositions;
 	
 private:
+	uint32         nrManualPencilBeams() const;
+	vector<double> getManualPencilBeam( const unsigned pencil ) const;
+	uint32	       nrPencilRings() const;
+	double	       pencilRingSize() const;
+
 	void           addPosition(string stName);
 	double	       getTime(const char *name) const;
 	static int     findIndex(uint32 pset, const vector<uint32> &psets);
@@ -435,6 +437,17 @@ inline uint32 Parset::nrPencilBeams() const
   return 3 * nrPencilRings() * (nrPencilRings() + 1) + 1 + nrManualPencilBeams();
 }
 
+inline PencilCoordinates Parset::pencilBeams() const
+{
+  // include both the pencil rings and the manually defined pencil beam coordinates
+  PencilRings coordinates( nrPencilRings(), pencilRingSize() );
+  for( unsigned i = 0; i < nrManualPencilBeams(); i++ ) {
+    coordinates += PencilCoord3D( getManualPencilBeam( i ) );
+  }
+
+  return coordinates;
+}
+
 inline double Parset::pencilRingSize() const
 {
   return getDouble("Observation.pencilRingSize");
diff --git a/RTCP/Interface/include/Interface/PencilCoordinates.h b/RTCP/Interface/include/Interface/PencilCoordinates.h
index 5e7c9b7f3c5..984267478b4 100644
--- a/RTCP/Interface/include/Interface/PencilCoordinates.h
+++ b/RTCP/Interface/include/Interface/PencilCoordinates.h
@@ -89,7 +89,12 @@ class PencilCoord3D {
       return *this;
     }
 
-    inline double& operator[]( const unsigned i )
+    inline const double operator[]( const unsigned i ) const
+    {
+      return itsXYZ[i];
+    }
+
+    inline double &operator[]( const unsigned i )
     {
       return itsXYZ[i];
     }
@@ -117,12 +122,15 @@ class PencilCoordinates
     PencilCoordinates( const std::vector<PencilCoord3D> &coordinates ): itsCoordinates(coordinates) {}
     PencilCoordinates( const Matrix<double> &coordinates );
 
-    std::vector<PencilCoord3D>& getCoordinates()
+    inline std::vector<PencilCoord3D>& getCoordinates() 
     { return itsCoordinates; }
 
-    size_t size() const
+    inline size_t size() const
     { return itsCoordinates.size(); }
 
+    inline const PencilCoord3D& operator[]( const unsigned nr ) const
+    { return itsCoordinates[nr]; }
+
     void read( Stream *s );
     void write( Stream *s ) const;
 
diff --git a/RTCP/Interface/include/Interface/PipelineOutput.h b/RTCP/Interface/include/Interface/PipelineOutput.h
index 240725b21e9..eb8939c6bcd 100644
--- a/RTCP/Interface/include/Interface/PipelineOutput.h
+++ b/RTCP/Interface/include/Interface/PipelineOutput.h
@@ -128,7 +128,7 @@ inline PipelineOutputSet::PipelineOutputSet( const Parset &ps, Allocator &alloca
   switch( mode.mode() ) {
     case CN_Mode::FILTER:
 	o = new PipelineOutput( id++, PipelineOutput::FILTEREDDATA );
-        o->itsData = new FilteredData( ps.nrStations(), ps.nrChannelsPerSubband(), ps.CNintegrationSteps() );
+        o->itsData = new FilteredData( ps.nrStations(), ps.nrChannelsPerSubband(), ps.CNintegrationSteps(), ps.nrPencilBeams() );
         break;
 
     case CN_Mode::CORRELATE:
diff --git a/RTCP/Interface/include/Interface/SubbandMetaData.h b/RTCP/Interface/include/Interface/SubbandMetaData.h
index ab7ff4290e5..1e9115f111e 100644
--- a/RTCP/Interface/include/Interface/SubbandMetaData.h
+++ b/RTCP/Interface/include/Interface/SubbandMetaData.h
@@ -24,7 +24,9 @@
 #define LOFAR_INTERFACE_SUBBAND_META_DATA_H
 
 #include <Interface/SparseSet.h>
+#include <Interface/MultiDimArray.h>
 #include <Stream/Stream.h>
+#include <Common/LofarLogger.h>
 
 #include <cassert>
 
@@ -32,38 +34,101 @@
 namespace LOFAR {
 namespace RTCP {
 
+// Note: struct must remain copyable to avoid ugly constructions when passing it around
 struct SubbandMetaData
 {
   public:
+    SubbandMetaData();
+    SubbandMetaData( const unsigned nrBeams );
+
     SparseSet<unsigned>	getFlags() const;
     void		setFlags(const SparseSet<unsigned> &);
 
-    float		delayAtBegin, delayAfterEnd;
-    double              beamDirectionAtBegin[3], beamDirectionAfterEnd[3];
+    unsigned            alignmentShift() const;
+    unsigned            &alignmentShift();
+
+    void read( Stream *str );
+    void write( Stream *str ) const;
+
+    struct beamInfo {
+      float delayAtBegin, delayAfterEnd;
+      double beamDirectionAtBegin[3], beamDirectionAfterEnd[3];
+    };
 
-    unsigned		alignmentShift;
+    // Note: CNProc/AsyncTranspose reads the information below directly
+    std::vector<struct beamInfo> beams;
 
-  private:
-    unsigned char	flagsBuffer[132];
+    struct marshallData {
+      unsigned char	flagsBuffer[132];
+      unsigned		alignmentShift;
+    } itsMarshalledData;
 };
 
+inline SubbandMetaData::SubbandMetaData()
+{
+  // Used for constructing vectors, such as Vector<SubbandMetaData> metaData( itsNrStations, 32, allocator )
+  // Without a default constructor, construction of such an array is possible, but quite awkward as it
+  // requires the manual construction and destruction of each element:
+  //
+  // SubbandMetaData *metaData = allocator.allocate(itsNrStations*sizeof(SubbandMetaData),32)
+  // for( unsigned i = 0; i < itsNrStations; i++ ) {
+  //   metaData[i] = new SubbandMetaData( ... );
+  // }
+  // ...
+  // for( unsigned i = 0; i < itsNrStations; i++ ) {
+  //   delete metaData[i];
+  // }
+  // allocator.deallocate( metaData );
+  //
+  // NOTE: While C++ has a new[](...) construction especially for this use case, it is not part of ISO C++.
+}
+
+inline SubbandMetaData::SubbandMetaData( const unsigned nrBeams )
+: 
+  beams(nrBeams)
+{
+}
 
 inline SparseSet<unsigned> SubbandMetaData::getFlags() const
 {
   SparseSet<unsigned> flags;
 
-  flags.unmarshall(flagsBuffer);
+  flags.unmarshall(itsMarshalledData.flagsBuffer);
   return flags;
 }
 
-
 inline void SubbandMetaData::setFlags(const SparseSet<unsigned> &flags)
 {
-  ssize_t size = flags.marshall(&flagsBuffer, sizeof flagsBuffer);
+  ssize_t size = flags.marshall(&itsMarshalledData.flagsBuffer, sizeof itsMarshalledData.flagsBuffer);
   
   assert(size >= 0);
 }
 
+inline unsigned SubbandMetaData::alignmentShift() const
+{
+  return itsMarshalledData.alignmentShift;
+}
+
+inline unsigned &SubbandMetaData::alignmentShift()
+{
+  return itsMarshalledData.alignmentShift;
+}
+
+inline void SubbandMetaData::read( Stream *str )
+{
+  // TODO: endianness
+
+  str->read(&itsMarshalledData, sizeof itsMarshalledData);
+  str->read(&beams.front(), beams.size() * sizeof beams[0] );
+}
+
+inline void SubbandMetaData::write( Stream *str ) const
+{
+  // TODO: endianness
+
+  str->write(&itsMarshalledData, sizeof itsMarshalledData);
+  str->write(&beams.front(), beams.size() * sizeof beams[0] );
+}
 
 } // namespace RTCP
 } // namespace LOFAR
diff --git a/RTCP/Interface/src/CN_Configuration.cc b/RTCP/Interface/src/CN_Configuration.cc
index 502d31b8280..01e75ca82f7 100644
--- a/RTCP/Interface/src/CN_Configuration.cc
+++ b/RTCP/Interface/src/CN_Configuration.cc
@@ -47,9 +47,7 @@ CN_Configuration::CN_Configuration( const Parset &parset )
   outputPsets()             = parset.getUint32Vector("OLAP.CNProc.outputPsets");
   tabList()                 = parset.getUint32Vector("OLAP.CNProc.tabList");
   refFreqs()                = parset.subbandToFrequencyMapping();
-  nrPencilRings()           = parset.nrPencilRings();
-  pencilRingSize()          = parset.pencilRingSize();
-  nrManualPencilBeams()     = parset.nrManualPencilBeams();
+  pencilBeams()             = parset.pencilBeams();
   refPhaseCentre()          = parset.getRefPhaseCentres();
   mode()                    = CN_Mode(parset);
   outputIncoherentStokesI() = parset.outputIncoherentStokesI();
@@ -68,15 +66,6 @@ CN_Configuration::CN_Configuration( const Parset &parset )
       itsPhaseCentres[stat][dim] = positions[stat*3+dim];
     }
   }
-
-  itsManualPencilBeams.resize( parset.nrManualPencilBeams(), 2 );
-  for( unsigned beam = 0; beam < parset.nrManualPencilBeams(); beam++ ) {
-    std::vector<double> coordinates = parset.getManualPencilBeam( beam );
-
-    for( unsigned dim = 0; dim < 2; dim++ ) {
-      itsManualPencilBeams[beam][dim] = coordinates[dim];
-    }
-  }
 }
 
 #endif
@@ -86,6 +75,7 @@ void CN_Configuration::read(Stream *str)
 {
   str->read(&itsMarshalledData, sizeof itsMarshalledData);
   itsMode.read(str);
+  itsPencilBeams.read(str);
 
   itsInputPsets.resize(itsMarshalledData.itsInputPsetsSize);
   memcpy(&itsInputPsets[0], itsMarshalledData.itsInputPsets, itsMarshalledData.itsInputPsetsSize * sizeof(unsigned));
@@ -106,11 +96,6 @@ void CN_Configuration::read(Stream *str)
   for( unsigned stat = 0; stat < nrStations(); stat++ ) {
     memcpy(&itsPhaseCentres[stat][0], &itsMarshalledData.itsPhaseCentres[stat*3], 3 * sizeof(double));
   }
-
-  itsManualPencilBeams.resize(nrManualPencilBeams(),2);
-  for( unsigned beam = 0; beam < nrManualPencilBeams(); beam++ ) {
-    memcpy(&itsManualPencilBeams[beam][0], &itsMarshalledData.itsManualPencilBeams[beam*2], 2 * sizeof(double));
-  }
 }
 
 
@@ -137,12 +122,9 @@ void CN_Configuration::write(Stream *str)
     memcpy(&itsMarshalledData.itsPhaseCentres[stat*3], &itsPhaseCentres[stat][0], 3 * sizeof(double));
   }
 
-  for( unsigned beam = 0; beam < nrManualPencilBeams(); beam++ ) {
-    memcpy(&itsMarshalledData.itsManualPencilBeams[beam*2], &itsManualPencilBeams[beam][0], 2 * sizeof(double));
-  }
-
   str->write(&itsMarshalledData, sizeof itsMarshalledData);
   itsMode.write(str);
+  itsPencilBeams.write(str);
 }
 
 } // namespace RTCP
diff --git a/RTCP/Interface/src/PencilCoordinates.cc b/RTCP/Interface/src/PencilCoordinates.cc
index 2ab0728098e..e7fc46a09d1 100644
--- a/RTCP/Interface/src/PencilCoordinates.cc
+++ b/RTCP/Interface/src/PencilCoordinates.cc
@@ -153,7 +153,7 @@ double PencilRings::pencilHeightDelta() const
 
 void PencilRings::computeBeamCoordinates()
 {
-  std::vector<PencilCoord3D> &coordinates = getCoordinates();
+  std::vector<PencilCoord3D> coordinates;
   double dl[6], dm[6];
 
   // stride for each side, starting from the top, clock-wise
@@ -203,7 +203,6 @@ void PencilRings::computeBeamCoordinates()
   dm[5] = pencilHeightDelta();
 
   // ring 0: the center pencil beam
-  coordinates.reserve(nrPencils());
   coordinates.push_back( PencilCoord3D( 0, 0 ) );
 
   // ring 1-n: create the pencil beams from the inner ring outwards
@@ -219,6 +218,8 @@ void PencilRings::computeBeamCoordinates()
       }
     }
   }
+
+  *this += coordinates;
 }
 
 }
-- 
GitLab