diff --git a/RTCP/CNProc/src/ArenaMapping.h b/RTCP/CNProc/src/ArenaMapping.h
index b8b4dfc3726b4c1ba2d7470a911eaae99290e760..ca3844220c369c88fed9831bc06022460eef998e 100644
--- a/RTCP/CNProc/src/ArenaMapping.h
+++ b/RTCP/CNProc/src/ArenaMapping.h
@@ -28,6 +28,7 @@
 #include <Interface/Allocator.h>
 #include <Interface/StreamableData.h>
 #include <boost/noncopyable.hpp>
+#include <Common/LofarLogger.h>
 
 namespace LOFAR {
 namespace RTCP {
diff --git a/RTCP/CNProc/src/AsyncCommunication.cc b/RTCP/CNProc/src/AsyncCommunication.cc
index 7dad61c8758b053394bdd055c957bb38e1dda2e6..ba9c75cc09f7b357f54fa26d951294adaf4752a9 100644
--- a/RTCP/CNProc/src/AsyncCommunication.cc
+++ b/RTCP/CNProc/src/AsyncCommunication.cc
@@ -4,6 +4,7 @@
 #include <AsyncCommunication.h>
 
 #include <Common/Timer.h>
+#include <Interface/Exceptions.h>
 
 #include <cassert>
 #include <map>
@@ -17,14 +18,10 @@ namespace RTCP {
 #if defined HAVE_MPI
 
 AsyncCommunication::AsyncCommunication(MPI_Comm comm)
-{
-    itsCommunicator = comm;
-    itsCurrentReadHandle = 0;
-    itsCurrentWriteHandle = 0;
-}
-
-
-AsyncCommunication::~AsyncCommunication()
+:
+  itsCommunicator(comm),
+  itsCurrentReadHandle(0),
+  itsCurrentWriteHandle(0)
 {
 }
 
@@ -35,8 +32,7 @@ int AsyncCommunication::asyncRead(void* buf, unsigned size, unsigned source, int
 
     int res = MPI_Irecv(buf, size, MPI_BYTE, source, tag, itsCommunicator, &req->mpiReq);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Irecv() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Irecv() failed");
     }
 
     req->buf = buf;
@@ -57,8 +53,7 @@ int AsyncCommunication::asyncWrite(const void* buf, unsigned size, unsigned dest
 
     int res = MPI_Isend((void*)buf, size, MPI_BYTE, dest, tag, itsCommunicator, &req->mpiReq);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Isend() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Isend() failed");
     }
 
     req->buf = (void*)buf;
@@ -80,8 +75,7 @@ void AsyncCommunication::waitForRead(int handle)
 
     int res = MPI_Wait(&req->mpiReq, &status);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Wait() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Wait() failed");
     }
 
     // done, now remove from map, and free req
@@ -96,8 +90,7 @@ void AsyncCommunication::waitForWrite(int handle)
 
     int res = MPI_Wait(&req->mpiReq, &status);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Wait() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Wait() failed");
     }
 
     // done, now remove from map, and free req
@@ -112,6 +105,8 @@ int AsyncCommunication::waitForAnyRead(void*& buf, unsigned& size, unsigned& sou
     int count = itsReadHandleMap.size();
     MPI_Request reqs[count];
     int mapping[count];
+
+    ASSERT( count > 0 );
     
     int i = 0;
     for (std::map<int, AsyncRequest*>::const_iterator it = itsReadHandleMap.begin(); it != itsReadHandleMap.end(); it++) {
@@ -129,8 +124,11 @@ int AsyncCommunication::waitForAnyRead(void*& buf, unsigned& size, unsigned& sou
     waitAnyTimer.stop();
 
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Waitany() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Waitany() failed");
+    }
+
+    if (index == MPI_UNDEFINED) {
+	THROW(CNProcException,"MPI_Waitany() failed: no (pending) receives");
     }
 
     int handle = mapping[index];
@@ -153,6 +151,11 @@ void AsyncCommunication::waitForAllReads()
     MPI_Request reqs[count];
     MPI_Status status[count];
 
+    if( count == 0 ) {
+      // nothing to wait for
+      return;
+    }
+
     int i = 0;
     for (std::map<int, AsyncRequest*>::const_iterator it = itsReadHandleMap.begin(); it != itsReadHandleMap.end(); it++) {
 	AsyncRequest* r = it->second;
@@ -162,8 +165,7 @@ void AsyncCommunication::waitForAllReads()
 
     int res = MPI_Waitall(count, reqs, status);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Waitall() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Waitall() failed");
     }
 
     for (std::map<int, AsyncRequest*>::const_iterator it = itsReadHandleMap.begin(); it != itsReadHandleMap.end(); it++) {
@@ -181,6 +183,11 @@ void AsyncCommunication::waitForAllWrites()
     MPI_Request reqs[count];
     MPI_Status status[count];
 
+    if( count == 0 ) {
+      // nothing to wait for
+      return;
+    }
+
     int i = 0;
     for (std::map<int, AsyncRequest*>::const_iterator it = itsWriteHandleMap.begin(); it != itsWriteHandleMap.end(); it++) {
 	AsyncRequest* r = it->second;
@@ -190,8 +197,7 @@ void AsyncCommunication::waitForAllWrites()
 
     int res = MPI_Waitall(count, reqs, status);
     if (res != MPI_SUCCESS) {
-	LOG_FATAL("MPI_Waitall() failed");
-	exit(1);
+	THROW(CNProcException,"MPI_Waitall() failed");
     }
 
     for (std::map<int, AsyncRequest*>::const_iterator it = itsWriteHandleMap.begin(); it != itsWriteHandleMap.end(); it++) {
diff --git a/RTCP/CNProc/src/AsyncCommunication.h b/RTCP/CNProc/src/AsyncCommunication.h
index 6fd69d096e6ed79282e721032e15f537e1ba1eb4..0434916e633f2bbb59735a17bcb6fb4ffe6abd53 100644
--- a/RTCP/CNProc/src/AsyncCommunication.h
+++ b/RTCP/CNProc/src/AsyncCommunication.h
@@ -26,7 +26,6 @@ public:
 class AsyncCommunication: boost::noncopyable {
   public:
     AsyncCommunication(MPI_Comm communicator = MPI_COMM_WORLD);
-    ~AsyncCommunication();
 
     // returns handle to this read
     int asyncRead(void* buf, unsigned size, unsigned source, int tag);
diff --git a/RTCP/CNProc/src/AsyncTranspose.cc b/RTCP/CNProc/src/AsyncTranspose.cc
index 4f0e2e59043a222eea39b7cf03319a19cb6012d8..635d3349633bc02c896de13b2267ed7fb9135d4c 100644
--- a/RTCP/CNProc/src/AsyncTranspose.cc
+++ b/RTCP/CNProc/src/AsyncTranspose.cc
@@ -5,6 +5,7 @@
 
 #include <Interface/CN_Mapping.h>
 #include <Interface/PrintVector.h>
+#include <Common/LofarLogger.h>
 
 #include <cassert>
 
@@ -14,53 +15,55 @@ namespace RTCP {
 
 #if defined HAVE_MPI
 
-#define MAX_TAG 100000 // The maximum tag we use to represent a data message. 
-                       // Higher tags are metadata.
+#define MAX_RANK 100000 // used for message identification: id = type*MAX_RANK + rank
 
 template <typename SAMPLE_TYPE> AsyncTranspose<SAMPLE_TYPE>::AsyncTranspose(
   const bool isTransposeInput, const bool isTransposeOutput, 
   const unsigned groupNumber, const LocationInfo &locationInfo, 
   const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets, 
-  const unsigned nrSamplesToCNProc, const unsigned nrSubbands, const unsigned nrSubbandsPerPset)
+  const unsigned nrSubbands, const unsigned nrSubbandsPerPset, const unsigned nrPencilBeams )
 :
   itsIsTransposeInput(isTransposeInput),
   itsIsTransposeOutput(isTransposeOutput),
   itsNrSubbands(nrSubbands),
   itsNrSubbandsPerPset(nrSubbandsPerPset),
+  itsNrPencilBeams(nrPencilBeams),
+  itsAsyncComm(),
   itsInputPsets(inputPsets),
   itsOutputPsets(outputPsets),
   itsLocationInfo(locationInfo),
+  itsCommHandles(itsNrCommunications,inputPsets.size()),
   itsGroupNumber(groupNumber)
 {
-  InputData<SAMPLE_TYPE> oneSample( 1, nrSamplesToCNProc );
-
   for(unsigned i=0; i<inputPsets.size(); i++) {
     const unsigned rank = locationInfo.remapOnTree(inputPsets[i], itsGroupNumber);
     itsRankToPsetIndex[rank] = i;
   }
-
-  itsMessageSize = oneSample.requiredSize();
-  dataHandles.resize(inputPsets.size());
-  metaDataHandles.resize(inputPsets.size());
-  itsAsyncComm = new AsyncCommunication();
 }
 
-
-template <typename SAMPLE_TYPE> AsyncTranspose<SAMPLE_TYPE>::~AsyncTranspose()
-{
-    delete itsAsyncComm;
-}
-
-
 template <typename SAMPLE_TYPE> void AsyncTranspose<SAMPLE_TYPE>::postAllReceives(TransposedData<SAMPLE_TYPE> *transposedData)
 {
+    // there must be something to receive
+    ASSERT( itsInputPsets.size() > 0 );
+
     for(unsigned i=0; i<itsInputPsets.size(); i++) {
-      void* buf = (void*) transposedData->samples[i].origin();
       unsigned pset = itsInputPsets[i];
       unsigned rank = itsLocationInfo.remapOnTree(pset, itsGroupNumber); // TODO cache this? maybe in locationInfo itself?
 
-      dataHandles[i] = itsAsyncComm->asyncRead(buf, itsMessageSize, rank, rank);
-      metaDataHandles[i] = itsAsyncComm->asyncRead(&transposedData->metaData[i], sizeof(SubbandMetaData), rank, rank + MAX_TAG);
+      // define what to read
+      struct {
+        void *ptr;
+        size_t size;
+      } toRead[itsNrCommunications] = {
+        { transposedData->samples[i].origin(), transposedData->samples[i].num_elements() * sizeof(SAMPLE_TYPE) },
+        { &transposedData->metaData[i].itsMarshalledData, sizeof(SubbandMetaData::marshallData) },
+        { &transposedData->metaData[i].beams.front(), itsNrPencilBeams * sizeof(SubbandMetaData::beamInfo) }
+      };
+
+      // read it
+      for( unsigned h = 0; h < itsNrCommunications; h++ ) {
+        itsCommHandles[h][i] = itsAsyncComm.asyncRead(toRead[h].ptr, toRead[h].size, rank, rank + h*MAX_RANK);
+      }
     }
 }
 
@@ -73,23 +76,33 @@ template <typename SAMPLE_TYPE> unsigned AsyncTranspose<SAMPLE_TYPE>::waitForAny
     unsigned size, source;
     int tag;
 
-    // This read could return either a data message, or a meta data message.
-    itsAsyncComm->waitForAnyRead(buf, size, source, tag);
+    // This read could return any type of message (out of itsCommHandles)
+    itsAsyncComm.waitForAnyRead(buf, size, source, tag);
 
     // source is the real rank, calc pset index
     const unsigned psetIndex = itsRankToPsetIndex[source];
 
-    if(tag < MAX_TAG) { // real data message
-      dataHandles[psetIndex] = -1; // record that we have received the data
-      if(metaDataHandles[psetIndex] == -1) { // We already have the metadata
-	return psetIndex;
+    // mark the right communication handle as received
+    for( unsigned h = 0; h < itsNrCommunications; h++ ) {
+      if( static_cast<unsigned>(tag) < (h+1) * MAX_RANK ) {
+        itsCommHandles[h][psetIndex] = -1;
+        break;
       }
-    } else { // metadata message
-      metaDataHandles[psetIndex] = -1; // record that we have received the metadata
-      if(dataHandles[psetIndex] == -1) {
-	return psetIndex; // We already have the data
+    }
+
+    // check whether we have received all communications for this psetIndex.
+    // This is the case when commHandles are -1.
+    bool haveAll = true;
+    for( unsigned h = 0; h < itsNrCommunications; h++ ) {
+      if( itsCommHandles[h][psetIndex] != -1 ) {
+        haveAll = false;
+        break;
       }
     }
+
+    if( haveAll ) {
+      return psetIndex;
+    }
   }
 }
 
@@ -101,15 +114,27 @@ template <typename SAMPLE_TYPE> void AsyncTranspose<SAMPLE_TYPE>::asyncSend(cons
   const unsigned rank = itsLocationInfo.remapOnTree(pset, itsGroupNumber);
   const int tag = itsLocationInfo.rank();
 
-  itsAsyncComm->asyncWrite(inputData->samples[outputPsetNr].origin(), itsMessageSize, rank, tag);
-  itsAsyncComm->asyncWrite(&inputData->metaData[outputPsetNr], sizeof(SubbandMetaData), rank, tag + MAX_TAG);
+  // define what to write
+  struct {
+    const void *ptr;
+    const size_t size;
+  } toWrite[itsNrCommunications] = {
+    { inputData->samples[outputPsetNr].origin(), inputData->samples[outputPsetNr].num_elements() * sizeof(SAMPLE_TYPE) },
+    { &inputData->metaData[outputPsetNr].itsMarshalledData, sizeof(SubbandMetaData::marshallData) },
+    { &inputData->metaData[outputPsetNr].beams.front(), itsNrPencilBeams * sizeof(SubbandMetaData::beamInfo) }
+  };
+
+  // write it
+  for( unsigned h = 0; h < itsNrCommunications; h++ ) {
+    itsAsyncComm.asyncWrite( toWrite[h].ptr, toWrite[h].size, rank, tag + h*MAX_RANK );
+  }
 }
 
 
 template <typename SAMPLE_TYPE> void AsyncTranspose<SAMPLE_TYPE>::waitForAllSends()
 {
   // this includes the metadata writes...
-  itsAsyncComm->waitForAllWrites();
+  itsAsyncComm.waitForAllWrites();
 }
 
   
diff --git a/RTCP/CNProc/src/AsyncTranspose.h b/RTCP/CNProc/src/AsyncTranspose.h
index 6563177b672720afba42d1888a7c524b1adbc609..967ffa9a927b9789c68bf7b2a02a598b18890432 100644
--- a/RTCP/CNProc/src/AsyncTranspose.h
+++ b/RTCP/CNProc/src/AsyncTranspose.h
@@ -38,10 +38,8 @@ template <typename SAMPLE_TYPE> class AsyncTranspose
 
   AsyncTranspose(const bool isTransposeInput, const bool isTransposeOutput, 
 		 const unsigned groupNumber, const LocationInfo &, 
-		 const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets, const unsigned nrSamplesToCNProc, 
-		 const unsigned nrSubbands, const unsigned nrSubbandsPerPset);
-
-  ~AsyncTranspose();
+		 const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets, 
+		 const unsigned nrSubbands, const unsigned nrSubbandsPerPset, const unsigned nrPencilBeams );
   
   // Post all async receives for the transpose.
   void postAllReceives(TransposedData<SAMPLE_TYPE> *transposedData);
@@ -61,24 +59,24 @@ template <typename SAMPLE_TYPE> class AsyncTranspose
 
   unsigned itsNrSubbands;
   unsigned itsNrSubbandsPerPset;
+  unsigned itsNrPencilBeams;
 
-  // the size of a data message
-  unsigned itsMessageSize; 
-  
   // A mapping that tells us, if we receive a message from a source,
   // to which pset that source belongs.
   std::map<unsigned, unsigned> itsRankToPsetIndex; 
 
-  AsyncCommunication* itsAsyncComm;
+  AsyncCommunication itsAsyncComm;
   const std::vector<unsigned> &itsInputPsets;
-  const  std::vector<unsigned> &itsOutputPsets;
+  const std::vector<unsigned> &itsOutputPsets;
   const LocationInfo &itsLocationInfo;
 
-  // Two maps that contain the handles to the asynchronous reads.
+  // The number of communicates (writes/reads) needed to transport one sub band.
+  static const unsigned itsNrCommunications = 3;
+
+  // The maps that contain the handles to the asynchronous reads.
   // The maps are indexed by the inputPset index.
   // The value is -1 if the read finished.
-  std::vector<int> dataHandles;
-  std::vector<int> metaDataHandles;
+  Matrix<int> itsCommHandles; // [itsNrCommunications][itsNrInputPsets]
 
   // The number of the transpose group we belong to.
   // The cores with the same index in a pset together form a group.
diff --git a/RTCP/CNProc/src/BeamFormerAsm.h b/RTCP/CNProc/src/BeamFormerAsm.h
index 4cc368fddf2ce386010b86dc19f7e07254bf2617..32db2a4164ce691c7e7492542d5e49e5e849103a 100644
--- a/RTCP/CNProc/src/BeamFormerAsm.h
+++ b/RTCP/CNProc/src/BeamFormerAsm.h
@@ -1,5 +1,7 @@
 #if defined HAVE_BGP
 
+#include <cstring>
+
 namespace LOFAR {
 namespace RTCP {
 
@@ -129,6 +131,33 @@ void _beamform_6beams_2times(
 
 } // extern "C"
 
+// Similar functions that do not need or have an ASM version
+
+// defined just to aid the use of macros
+static inline void _add_1_single_precision_vectors(
+  float *dst,
+  const float *src1,
+  unsigned count /* non-zero; multiple of 16 */
+) {
+  // nothing to add, so just copy the values
+  memcpy( dst, src1, count * sizeof(float) );
+}
+
+static inline void _add_7_single_precision_vectors(
+  float *dst,
+  const float *src1,
+  const float *src2,
+  const float *src3,
+  const float *src4,
+  const float *src5,
+  const float *src6,
+  const float *src7,
+  unsigned count /* non-zero; multiple of 16 */
+) {
+  _add_4_single_precision_vectors( dst, src1, src2, src3, src4, count );
+  _add_4_single_precision_vectors( dst, dst,  src5, src6, src7, count );
+}
+
 } // namespace LOFAR::RTCP
 } // namespace LOFAR
 
diff --git a/RTCP/CNProc/src/CN_Processing.cc b/RTCP/CNProc/src/CN_Processing.cc
index 7e4277757369a42d182314711cab36732a04279a..3e5e52131a9e048c877a34554f8cfb8e63164c4c 100644
--- a/RTCP/CNProc/src/CN_Processing.cc
+++ b/RTCP/CNProc/src/CN_Processing.cc
@@ -264,6 +264,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   itsIsTransposeOutput = outputPsetIndex != outputPsets.end();
 
   itsNrStations	                   = configuration.nrStations();
+  itsNrPencilBeams                 = configuration.nrPencilBeams();
   itsNrSubbands                    = configuration.nrSubbands();
   itsNrSubbandsPerPset             = configuration.nrSubbandsPerPset();
   itsMode                          = configuration.mode();
@@ -283,10 +284,6 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   const unsigned nrBeamFormedStations = itsBeamFormer->getNrBeamFormedStations();
   const unsigned nrBaselines = nrBeamFormedStations * (nrBeamFormedStations + 1) / 2;
 
-  // include both the pencil rings and the manually defined pencil beam coordinates
-  PencilRings pencilCoordinates( configuration.nrPencilRings(), configuration.pencilRingSize() );
-  pencilCoordinates += PencilCoordinates( configuration.manualPencilBeams() );
-
   // Each phase (e.g., transpose, PPF, correlator) reads from an input data
   // set and writes to an output data set.  To save memory, two memory buffers
   // are used, and consecutive phases alternately use one of them as input
@@ -295,14 +292,14 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   // Allocators for a single arena.
 
   if (itsIsTransposeInput) {
-    itsInputData = new InputData<SAMPLE_TYPE>(outputPsets.size(), nrSamplesToCNProc);
+    itsInputData = new InputData<SAMPLE_TYPE>(outputPsets.size(), nrSamplesToCNProc, itsNrPencilBeams);
   }
 
   if (itsIsTransposeOutput) {
     // create only the data structures that are used by the pipeline
 
-    itsTransposedData = new TransposedData<SAMPLE_TYPE>(itsNrStations, nrSamplesToCNProc);
-    itsFilteredData   = new FilteredData(itsNrStations, nrChannels, nrSamplesPerIntegration);
+    itsTransposedData = new TransposedData<SAMPLE_TYPE>(itsNrStations, nrSamplesToCNProc, itsNrPencilBeams);
+    itsFilteredData   = new FilteredData(itsNrStations, nrChannels, nrSamplesPerIntegration, itsNrPencilBeams);
 
     switch( itsMode.mode() ) {
       case CN_Mode::FILTER:
@@ -314,17 +311,17 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
         break;
 
       case CN_Mode::COHERENT_COMPLEX_VOLTAGES:
-        itsPencilBeamData = new PencilBeamData(pencilCoordinates.size(), nrChannels, nrSamplesPerIntegration);
+        itsPencilBeamData = new PencilBeamData(itsNrPencilBeams, nrChannels, nrSamplesPerIntegration);
         break;
 
       case CN_Mode::COHERENT_STOKES_I:
       case CN_Mode::COHERENT_ALLSTOKES:
-        itsPencilBeamData = new PencilBeamData(pencilCoordinates.size(), nrChannels, nrSamplesPerIntegration);
+        itsPencilBeamData = new PencilBeamData(itsNrPencilBeams, nrChannels, nrSamplesPerIntegration);
         // fallthrough
 
       case CN_Mode::INCOHERENT_STOKES_I:
       case CN_Mode::INCOHERENT_ALLSTOKES:
-        itsStokesData     = new StokesData(itsMode.isCoherent(), itsMode.nrStokes(), pencilCoordinates.size(), nrChannels, nrSamplesPerIntegration, nrSamplesPerStokesIntegration);
+        itsStokesData     = new StokesData(itsMode.isCoherent(), itsMode.nrStokes(), itsNrPencilBeams, nrChannels, nrSamplesPerIntegration, nrSamplesPerStokesIntegration);
         break;
 
       default:
@@ -337,7 +334,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
     }
 
     if( itsStokesIntegrateChannels ) {
-      itsStokesDataIntegratedChannels = new StokesDataIntegratedChannels(itsMode.isCoherent(), itsMode.nrStokes(), pencilCoordinates.size(), nrSamplesPerIntegration, nrSamplesPerStokesIntegration);
+      itsStokesDataIntegratedChannels = new StokesDataIntegratedChannels(itsMode.isCoherent(), itsMode.nrStokes(), itsNrPencilBeams, nrSamplesPerIntegration, nrSamplesPerStokesIntegration);
     }
   }
 
@@ -359,7 +356,6 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   // create the arenas and allocate the data sets
   itsMapping.allocate();
 
-  
   if (itsIsTransposeOutput) {
     const unsigned logicalNode	= usedCoresPerPset * (outputPsetIndex - outputPsets.begin()) + myCore;
     // TODO: logicalNode assumes output psets are consecutively numbered
@@ -376,7 +372,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
 
     itsPPF	      = new PPF<SAMPLE_TYPE>(itsNrStations, nrChannels, nrSamplesPerIntegration, configuration.sampleRate() / nrChannels, configuration.delayCompensation(), itsLocationInfo.rank() == 0);
 
-    itsPencilBeamFormer  = new PencilBeams(pencilCoordinates, itsNrStations, nrChannels, nrSamplesPerIntegration, itsCenterFrequencies[itsCurrentSubband], configuration.sampleRate() / nrChannels, configuration.refPhaseCentre(), configuration.phaseCentres() );
+    itsPencilBeamFormer  = new PencilBeams(itsNrPencilBeams, itsNrStations, nrChannels, nrSamplesPerIntegration, itsCenterFrequencies[itsCurrentSubband], configuration.sampleRate() / nrChannels );
     itsStokes            = new Stokes(itsMode.isCoherent(), itsMode.nrStokes(), nrChannels, nrSamplesPerIntegration, nrSamplesPerStokesIntegration );
     itsIncoherentStokesI = new Stokes(false, 1, nrChannels, nrSamplesPerIntegration, nrSamplesPerStokesIntegration );
 
@@ -388,7 +384,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::preprocess(CN_C
   if (itsIsTransposeInput || itsIsTransposeOutput) {
     itsAsyncTranspose = new AsyncTranspose<SAMPLE_TYPE>(itsIsTransposeInput, itsIsTransposeOutput, 
 							myCore, itsLocationInfo, inputPsets, outputPsets, 
-							nrSamplesToCNProc, itsNrSubbands, itsNrSubbandsPerPset);
+							itsNrSubbands, itsNrSubbandsPerPset, itsNrPencilBeams);
   }
 #endif // HAVE_MPI
 }
@@ -529,7 +525,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::calculateCohere
     LOG_DEBUG(std::setprecision(12) << "core " << itsLocationInfo.rank() << ": start calculating coherent Stokes at " << MPI_Wtime());
 #endif // HAVE_MPI
   computeTimer.start();
-  itsStokes->calculateCoherent(itsPencilBeamData,itsStokesData,itsPencilBeamFormer->nrCoordinates());
+  itsStokes->calculateCoherent(itsPencilBeamData,itsStokesData,itsNrPencilBeams);
   computeTimer.stop();
 }
 
@@ -609,7 +605,7 @@ template <typename SAMPLE_TYPE> void CN_Processing<SAMPLE_TYPE>::process()
         formPencilBeams();
         calculateCoherentStokes();
 	if( itsStokesIntegrateChannels ) {
-	  itsStokes->compressStokes( itsStokesData, itsStokesDataIntegratedChannels, itsPencilBeamFormer->nrCoordinates() );
+	  itsStokes->compressStokes( itsStokesData, itsStokesDataIntegratedChannels, itsNrPencilBeams );
           sendOutput( itsStokesDataIntegratedChannels );
 	} else {
           sendOutput( itsStokesData );
diff --git a/RTCP/CNProc/src/CN_Processing.h b/RTCP/CNProc/src/CN_Processing.h
index d7f5487eeb4e6c45b272edd9cf61fff0a4feb792..b686e639ede26541df37f6390bf392ff6c25f8e0 100644
--- a/RTCP/CNProc/src/CN_Processing.h
+++ b/RTCP/CNProc/src/CN_Processing.h
@@ -117,6 +117,7 @@ template <typename SAMPLE_TYPE> class CN_Processing : public CN_Processing_Base,
 #endif
 
     unsigned            itsNrStations;
+    unsigned            itsNrPencilBeams;
     unsigned            itsNrSubbands;
     unsigned            itsNrSubbandsPerPset;
     unsigned            itsComputeGroupRank;
diff --git a/RTCP/CNProc/src/CN_Processing_main.cc b/RTCP/CNProc/src/CN_Processing_main.cc
index ec84d9ea1106d3e834c6e62e5e297261da11f584..a01f4a5f650a5fd301ea11fa2bdfb3e05b96782e 100644
--- a/RTCP/CNProc/src/CN_Processing_main.cc
+++ b/RTCP/CNProc/src/CN_Processing_main.cc
@@ -48,7 +48,7 @@
 
 // if exceptions are not caught, an attempt is made to create a backtrace
 // from the place where the exception is thrown.
-#undef CATCH_EXCEPTIONS
+#define CATCH_EXCEPTIONS
 
 
 using namespace LOFAR;
diff --git a/RTCP/CNProc/src/InputData.h b/RTCP/CNProc/src/InputData.h
index 900886dae08d6d469da9ce4ec4838f227719a96d..27e158178d23c3771af7ae8cdbebf2ad35be6870 100644
--- a/RTCP/CNProc/src/InputData.h
+++ b/RTCP/CNProc/src/InputData.h
@@ -23,7 +23,7 @@ template <typename SAMPLE_TYPE> class InputData: public SampleData<SAMPLE_TYPE,3
   public:
     typedef SampleData<SAMPLE_TYPE,3> SuperType;
 
-    InputData(const unsigned nrSubbands, const unsigned nrSamplesToCNProc);
+    InputData(const unsigned nrSubbands, const unsigned nrSamplesToCNProc, const unsigned nrPencilBeams);
 
     virtual void allocate( Allocator &allocator = heapAllocator );
 
@@ -36,31 +36,38 @@ template <typename SAMPLE_TYPE> class InputData: public SampleData<SAMPLE_TYPE,3
 
   private:
     const unsigned	    itsNrSubbands;
+    const unsigned	    itsNrPencilBeams;
 
   public:
     Vector<SubbandMetaData> metaData; //[outputPsets.size()]
 };
 
 
-template <typename SAMPLE_TYPE> inline InputData<SAMPLE_TYPE>::InputData(const unsigned nrSubbands, const unsigned nrSamplesToCNProc)
+template <typename SAMPLE_TYPE> inline InputData<SAMPLE_TYPE>::InputData(const unsigned nrSubbands, const unsigned nrSamplesToCNProc, const unsigned nrPencilBeams)
 :
   SuperType( false, boost::extents[nrSubbands][nrSamplesToCNProc][NR_POLARIZATIONS], 0 ),
-  itsNrSubbands(nrSubbands)
+  itsNrSubbands(nrSubbands),
+  itsNrPencilBeams(nrPencilBeams)
 {
 }
 
 template <typename SAMPLE_TYPE> inline void InputData<SAMPLE_TYPE>::allocate( Allocator &allocator )
 {
   SuperType::allocate( allocator );
-  metaData.resize( itsNrSubbands );
-}
+  metaData.resize( itsNrSubbands, 32 );
 
+  for( unsigned i = 0; i < itsNrSubbands; i++ ) {
+    metaData[i] = SubbandMetaData( itsNrPencilBeams );
+  }
+}
 
 // used for asynchronous transpose
 template <typename SAMPLE_TYPE> inline void InputData<SAMPLE_TYPE>::readMetaData(Stream *str)
 {
   // read all metadata
-  str->read(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+  for( unsigned subband = 0; subband < itsNrSubbands; subband++ ) {
+    metaData[subband].read( str );
+  }
 }
 
 // used for asynchronous transpose
@@ -76,7 +83,7 @@ template <typename SAMPLE_TYPE> inline void InputData<SAMPLE_TYPE>::readOne(Stre
 template <typename SAMPLE_TYPE> inline void InputData<SAMPLE_TYPE>::readAll(Stream *str)
 {
   // read all metadata
-  str->read(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+  readMetaData( str );
 
   // now read all subbands using one recvBlocking call, even though the ION
   // sends all subbands one at a time
diff --git a/RTCP/CNProc/src/PPF.cc b/RTCP/CNProc/src/PPF.cc
index eb6a135c1536288d145f5a6cee53d76bc7dae05c..d41f4cd6a44176102c5ffaa727cb29b5f4204e12 100644
--- a/RTCP/CNProc/src/PPF.cc
+++ b/RTCP/CNProc/src/PPF.cc
@@ -262,7 +262,7 @@ template <typename SAMPLE_TYPE> void PPF<SAMPLE_TYPE>::filter(unsigned stat, dou
   _bgl_mutex_lock(mutex);
 #endif
 
-  unsigned alignmentShift = transposedData->metaData[stat].alignmentShift;
+  const unsigned alignmentShift = transposedData->metaData[stat].alignmentShift();
 
 #if 0
   LOG_DEBUG_STR(setprecision(15) << "stat " << stat << ", basefreq " << baseFrequency << ": delay from " << delays[stat].delayAtBegin << " to " << delays[stat].delayAfterEnd << " sec");
@@ -348,12 +348,11 @@ template <typename SAMPLE_TYPE> void PPF<SAMPLE_TYPE>::filter(unsigned stat, dou
 
   struct phase_shift phaseShifts[itsNrSamplesPerIntegration];
 
-  computePhaseShifts(phaseShifts, transposedData->metaData[stat].delayAtBegin, transposedData->metaData[stat].delayAfterEnd, baseFrequency);
+  computePhaseShifts(phaseShifts, transposedData->metaData[stat].beams[0].delayAtBegin, transposedData->metaData[stat].beams[0].delayAfterEnd, baseFrequency);
 
   // forward (pencil) beam forming information
-  for( unsigned i = 0; i < 3; i++ ) {
-    filteredData->metaData[stat].beamDirectionAtBegin[i] = transposedData->metaData[stat].beamDirectionAtBegin[i];
-    filteredData->metaData[stat].beamDirectionAfterEnd[i] = transposedData->metaData[stat].beamDirectionAfterEnd[i];
+  for( unsigned p = 0; p < filteredData->metaData[stat].beams.size(); p++ ) {
+    filteredData->metaData[stat].beams[p] = transposedData->metaData[stat].beams[p];
   }
 
   const SparseSet<unsigned>::Ranges &ranges = filteredData->flags[stat].getRanges();
diff --git a/RTCP/CNProc/src/PencilBeams.cc b/RTCP/CNProc/src/PencilBeams.cc
index 38e6be799ff3a080585acb8af93c596157141d01..e2427185ac5b4799122fac87f85fad1dc5d3fcbd 100644
--- a/RTCP/CNProc/src/PencilBeams.cc
+++ b/RTCP/CNProc/src/PencilBeams.cc
@@ -6,11 +6,14 @@
 #include <Interface/MultiDimArray.h>
 #include <Interface/PrintVector.h>
 #include <Interface/Exceptions.h>
+#include <Interface/SubbandMetaData.h>
 #include <Common/Timer.h>
+#include <Common/LofarLogger.h>
 #include <iostream>
 #include <cmath>
 #include <cassert>
 #include <vector>
+#include <cstring>
 
 #ifndef PENCILBEAMS_C_IMPLEMENTATION
   #include <BeamFormerAsm.h>
@@ -25,63 +28,17 @@ namespace RTCP {
 
 static NSTimer pencilBeamFormTimer("PencilBeamFormer::formBeams()", true, true);
 
-PencilBeams::PencilBeams(PencilCoordinates &coordinates, const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const double centerFrequency, const double channelBandwidth, const std::vector<double> &refPhaseCentre, const Matrix<double> &phaseCentres )
+PencilBeams::PencilBeams(const unsigned nrPencilBeams, const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const double centerFrequency, const double channelBandwidth )
 :
-  itsCoordinates(coordinates.getCoordinates()),
   itsNrStations(nrStations),
+  itsNrPencilBeams(nrPencilBeams),
   itsNrChannels(nrChannels),
   itsNrSamplesPerIntegration(nrSamplesPerIntegration),
   itsCenterFrequency(centerFrequency),
   itsChannelBandwidth(channelBandwidth),
   itsBaseFrequency(centerFrequency - (nrChannels/2) * channelBandwidth),
-  itsRefPhaseCentre(refPhaseCentre)
+  itsDelays( nrStations, nrPencilBeams )
 {
-  // copy all phase centres and their derived constants
-  itsPhaseCentres.reserve( nrStations );
-  itsBaselines.reserve( nrStations );
-  itsBaselinesSeconds.reserve( nrStations );
-  itsDelays.resize( nrStations, itsCoordinates.size() );
-  itsDelayOffsets.resize( nrStations, itsCoordinates.size() );
-  for( unsigned stat = 0; stat < nrStations; stat++ ) {
-     const double x = phaseCentres[stat][0];
-     const double y = phaseCentres[stat][1];
-     const double z = phaseCentres[stat][2];
-
-     PencilCoord3D phaseCentre( x, y, z );
-     PencilCoord3D baseLine = phaseCentre - refPhaseCentre;
-     PencilCoord3D baseLineSeconds = baseLine * (1.0/speedOfLight);
-
-     itsPhaseCentres.push_back( phaseCentre );
-     itsBaselines.push_back( baseLine );
-     itsBaselinesSeconds.push_back( baseLineSeconds );
-
-     for( unsigned beam = 0; beam < itsCoordinates.size(); beam++ ) {
-       itsDelayOffsets[stat][beam] = baseLine * itsCoordinates[beam] * (1.0/speedOfLight);
-     }
-  }
-}
-
-void PencilBeams::calculateDelays( unsigned stat, const PencilCoord3D &beamDir )
-{
-  const double compensatedDelay = itsDelayOffsets[stat][0] - itsBaselinesSeconds[stat] * beamDir;
-
-  //LOG_DEBUG_STR("station " << stat << " beam 0 has an absolute delay of  " << compensatedDelay);
-
-  // centre beam does not need compensation
-  itsDelays[stat][0] = 0.0;
-
-  for( unsigned i = 1; i < itsCoordinates.size(); i++ ) {
-     // delay[i] = baseline * (coordinate - beamdir) / c
-     //          = (baseline * coordinate / c) - (baseline * beamdir) / c
-     //          = delayoffset - baselinesec * beamdir
-     //
-     // further reduced by the delay we already compensate for when doing regular beam forming (the centre of the beam). that compensation is done at the IONode (sample shift) and the PPF (phase shift)
-     itsDelays[stat][i] = itsDelayOffsets[stat][i] - itsBaselinesSeconds[stat] * beamDir - compensatedDelay;
-
-     //LOG_DEBUG_STR("station " << stat << " beam " << i << "has an additional delay of " << itsDelays[stat][i]);
-     //LOG_DEBUG_STR(itsDelayOffsets[stat][i] << " " << itsBaselinesSeconds[stat] << " " << beamDir << " " << compensatedDelay);
-     //LOG_DEBUG_STR("example shift: " << phaseShift( itsBaseFrequency + itsNrChannels/2*itsChannelBandwidth, itsDelays[stat][i] ));
-  }
 }
 
 #ifdef PENCILBEAMS_C_IMPLEMENTATION
@@ -148,18 +105,20 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
 #else // ASM implementation
 
 // what we can process in one go
-static const unsigned NRSTATIONS = 3;
-static const unsigned NRBEAMS = 6;
+static const unsigned NRSTATIONS = 6;
+static const unsigned NRBEAMS = 3;
 #define BEAMFORMFUNC _beamform_up_to_6_stations_and_3_beams
+#define ADDFUNC(nr)  _add_ ## nr ## _single_precision_vectors
 
 // the number of samples to do in one go, such that the
 // caches are optimally used. itsNrSamplesPerIntegration needs
-// to be a multiple of this
+// to be a multiple of this.
+// Also, TIMESTEPSIZE needs to be a multiple of 16, as the assembly code requires it
 static const unsigned TIMESTEPSIZE = 128;
 
 void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData *out )
 {
-  const unsigned nrBeams = itsCoordinates.size();
+  ASSERT( TIMESTEPSIZE % 16 == 0 );
 
   if( itsNrSamplesPerIntegration % TIMESTEPSIZE > 0 ) {
     THROW(CNProcException, "nrSamplesPerIntegration (" << itsNrSamplesPerIntegration << ") needs to be a multiple of " << TIMESTEPSIZE );
@@ -187,7 +146,7 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
   }
 
   // conservative flagging: flag output if any input was flagged 
-  for( unsigned beam = 0; beam < nrBeams; beam++ ) {
+  for( unsigned beam = 0; beam < itsNrPencilBeams; beam++ ) {
     out->flags[beam].reset();
 
     for (unsigned stat = 0; stat < itsNrStations; stat++ ) {
@@ -205,23 +164,21 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
     const float factor = averagingFactor; // add multiplication factors as needed
 
     // construct the weights, with zeroes for unused data
-    fcomplex weights[itsNrStations][nrBeams] __attribute__ ((aligned(32)));
+    fcomplex weights[itsNrStations][itsNrPencilBeams] __attribute__ ((aligned(128)));
 
     for( unsigned s = 0; s < itsNrStations; s++ ) {
       if( validStation[s] ) {
-        for( unsigned b = 0; b < nrBeams; b++ ) {
+        for( unsigned b = 0; b < itsNrPencilBeams; b++ ) {
           weights[s][b] = phaseShift( frequency, itsDelays[s][b] ) * factor;
         }
       } else {
         // a dropped station has a weight of 0
-        for( unsigned b = 0; b < nrBeams; b++ ) {
+        for( unsigned b = 0; b < itsNrPencilBeams; b++ ) {
           weights[s][b] = makefcomplex( 0, 0 );
         }
       }
     }
 
-    // TODO: separate construction of central beam
-
     unsigned processBeams = NRBEAMS;
     unsigned processStations = NRSTATIONS;
 
@@ -233,104 +190,69 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
 
       for( unsigned time = 0; time < itsNrSamplesPerIntegration; time += TIMESTEPSIZE ) {
         // central beam (#0) has no weights, we can simply add the stations
+
 	switch( processStations ) {
 	  case 0:
 	  default:
 	    THROW(CNProcException,"Requested to add " << processStations << " stations, but can only add 1-6.");
 	    break;
 
+// possible inputs
+#define OUTPUT		(reinterpret_cast<float*>(out->samples[ch][0][time].origin()))
+#define STATION(nr)	(reinterpret_cast<const float*>(in->samples[ch][stat+nr][time].origin()))
+
+// shorthand for the add functions
+#define ADDGENERIC(nr,...)	ADDFUNC(nr)(						\
+	        OUTPUT,									\
+		__VA_ARGS__,								\
+	        TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */ )
+
+// adds stations, and the subtotal if needed (if stat!=0)
+#define ADD(nr,nrplusone,...)	do {							\
+				  if( stat ) {						\
+				    ADDGENERIC(nrplusone,OUTPUT,__VA_ARGS__);		\
+				  } else {						\
+				    ADDGENERIC(nr,__VA_ARGS__);				\
+				  }							\
+				} while(0);
+
 	  case 1:
-	    // nothing to add, so just copy the values
-	    memcpy( out->samples[ch][0][time].origin(), in->samples[ch][stat+0][time].origin(),
-              TIMESTEPSIZE * NR_POLARIZATIONS * sizeof out->samples[0][0][0][0] );
+            ADD( 1, 2, STATION(0) );
 	    break;
 
 	  case 2:
-	    _add_2_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 2, 3, STATION(0), STATION(1) );
 	    break;
 
 	  case 3:
-	    _add_3_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 3, 4, STATION(0), STATION(1), STATION(2) );
 	    break;
 
 	  case 4:
-	    _add_4_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+3][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 4, 5, STATION(0), STATION(1), STATION(2), STATION(3) );
 	    break;
 
 	  case 5:
-	    _add_4_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+3][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
-
-	    _add_2_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-	      reinterpret_cast<const float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+4][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 5, 6, STATION(0), STATION(1), STATION(2), STATION(3), STATION(4) );
 	    break;
-#if 0
-	  case 5:
-	    _add_5_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+3][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+4][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
-	    break;
-#endif
+
 	  case 6:
-	    _add_6_single_precision_vectors(
-	      reinterpret_cast<float*>(out->samples[ch][0][time].origin()), 
-              reinterpret_cast<const float*>(in->samples[ch][stat+0][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+1][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+2][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+3][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+4][time].origin()),
-              reinterpret_cast<const float*>(in->samples[ch][stat+5][time].origin()),
-	      TIMESTEPSIZE * NR_POLARIZATIONS * 2 /* 2 for real & imaginary parts */
-	      );
+            ADD( 6, 7, STATION(0), STATION(1), STATION(2), STATION(3), STATION(4), STATION(5) );
 	    break;
 	}
 
 	// non-central beams
-        for( unsigned beam = 1; beam < nrBeams; beam += processBeams ) {
-          processBeams = std::min( NRBEAMS, nrBeams - beam ); 
+        for( unsigned beam = 1; beam < itsNrPencilBeams; beam += processBeams ) {
+          processBeams = std::min( NRBEAMS, itsNrPencilBeams - beam ); 
 
           // beam form
 	  // note: PPF.cc puts zeroes at flagged samples, so we can just add them
           BEAMFORMFUNC(
             out->samples[ch][beam][time].origin(),
-            out->samples[ch][beam].strides()[0] * sizeof out->samples[0][0][0][0],
+            out->samples[ch].strides()[0] * sizeof out->samples[0][0][0][0],
 
             in->samples[ch][stat][time].origin(),
-            in->samples[ch][stat].strides()[0] * sizeof in->samples[0][0][0][0],
+            in->samples[ch].strides()[0] * sizeof in->samples[0][0][0][0],
 
             &weights[stat][beam],
             (&weights[1][0] - &weights[0][0]) * sizeof weights[0][0],
@@ -350,16 +272,23 @@ void PencilBeams::computeComplexVoltages( const FilteredData *in, PencilBeamData
 
 #endif
 
-void PencilBeams::calculateAllDelays( const FilteredData *filteredData )
+void PencilBeams::calculateDelays( const FilteredData *filteredData )
 {
-  // calculate the delays for each station for this integration period
+  // Calculate the delays for each station for this integration period.
+
+  // We assume that the delay compensation has already occurred for the central beam. Also,
+  // we use the same delay for all time samples. This could be interpolated for TIMESTEPSIZE
+  // portions, as used in computeComplexVoltages.
+
   for( unsigned stat = 0; stat < itsNrStations; stat++ ) {
-    // todo: interpolate per time step?
-    PencilCoord3D beamDirBegin = filteredData->metaData[stat].beamDirectionAtBegin;
-    PencilCoord3D beamDirEnd = filteredData->metaData[stat].beamDirectionAfterEnd;
-    PencilCoord3D beamDirAverage = (beamDirBegin + beamDirEnd) * 0.5;
+    for( unsigned pencil = 0; pencil < itsNrPencilBeams; pencil++ ) {
+      const SubbandMetaData::beamInfo &beamInfo = filteredData->metaData[stat].beams[pencil];
 
-    calculateDelays( stat, beamDirAverage );
+      itsDelays[stat][pencil] = (beamInfo.delayAfterEnd - beamInfo.delayAtBegin) * 0.5;
+
+      // subtract the delay that was already compensated for (i.e. the central beam)
+      itsDelays[stat][pencil] -= itsDelays[stat][0];
+    }
   }
 }
 
@@ -370,7 +299,7 @@ void PencilBeams::formPencilBeams( const FilteredData *filteredData, PencilBeamD
 
   pencilBeamFormTimer.start();
 
-  calculateAllDelays( filteredData );
+  calculateDelays( filteredData );
   computeComplexVoltages( filteredData, pencilBeamData );
 
   pencilBeamFormTimer.stop();
diff --git a/RTCP/CNProc/src/PencilBeams.h b/RTCP/CNProc/src/PencilBeams.h
index d6ef72bc009fd04e9e2a708c7b66b83d2f00a4cd..fd7eed8dcfab7f8fcedb2ef40083da31231b9672 100644
--- a/RTCP/CNProc/src/PencilBeams.h
+++ b/RTCP/CNProc/src/PencilBeams.h
@@ -16,40 +16,33 @@
 namespace LOFAR {
 namespace RTCP {
 
+const double speedOfLight = 299792458;
 
 class PencilBeams
 {
   public:
-    static const double speedOfLight = 299792458;
     static const float MAX_FLAGGED_PERCENTAGE = 0.9f;
 
-    PencilBeams(PencilCoordinates &coordinates, const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const double centerFrequency, const double channelBandwidth, const std::vector<double> &refPhaseCentre, const Matrix<double> &phaseCentres );
+    PencilBeams(const unsigned nrPencilBeams, const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const double centerFrequency, const double channelBandwidth );
 
     void formPencilBeams( const FilteredData *filteredData, PencilBeamData *pencilBeamData );
 
-    size_t nrCoordinates() const { return itsCoordinates.size(); }
-
   private:
+    void calculateDelays( const FilteredData *filteredData );
+
     fcomplex phaseShift( const float frequency, const float delay ) const;
     void computeBeams( const MultiDimArray<fcomplex,4> &in, MultiDimArray<fcomplex,4> &out, const std::vector<unsigned> stations );
-    void calculateDelays( const unsigned stat, const PencilCoord3D &beamDir );
-    void calculateAllDelays( const FilteredData *filteredData );
 
     void computeComplexVoltages( const FilteredData *filteredData, PencilBeamData *pencilBeamData );
 
-    std::vector<PencilCoord3D> itsCoordinates;
     const unsigned          itsNrStations;
+    const unsigned          itsNrPencilBeams;
     const unsigned          itsNrChannels;
     const unsigned          itsNrSamplesPerIntegration;
     const double            itsCenterFrequency;
     const double            itsChannelBandwidth;
     const double            itsBaseFrequency;
-    Matrix<double>          itsDelays; // [itsNrStations][itsCoordinates.size()]
-    Matrix<double>          itsDelayOffsets; // [itsNrStations][itsCoordinates.size()]
-    PencilCoord3D           itsRefPhaseCentre;
-    std::vector<PencilCoord3D> itsPhaseCentres;
-    std::vector<PencilCoord3D> itsBaselines;
-    std::vector<PencilCoord3D> itsBaselinesSeconds;
+    Matrix<double>          itsDelays; // [itsNrStations][itsNrPencilBeams]
 };
 
 inline fcomplex PencilBeams::phaseShift( const float frequency, const float delay ) const
diff --git a/RTCP/CNProc/src/TransposedData.h b/RTCP/CNProc/src/TransposedData.h
index 8aad180da8f27c62e713311f3728b62ebfc802e5..dccea444c4270d6e620e58434271bcc2b14e8d6a 100644
--- a/RTCP/CNProc/src/TransposedData.h
+++ b/RTCP/CNProc/src/TransposedData.h
@@ -3,6 +3,7 @@
 
 #include <Common/lofar_complex.h>
 #include <Interface/Align.h>
+#include <Common/LofarLogger.h>
 #include <Interface/Config.h>
 #include <Interface/MultiDimArray.h>
 #include <Interface/StreamableData.h>
@@ -19,22 +20,23 @@ template <typename SAMPLE_TYPE> class TransposedData: public SampleData<SAMPLE_T
   public:
     typedef SampleData<SAMPLE_TYPE,3> SuperType;
 
-    TransposedData(const unsigned nrStations, const unsigned nrSamplesToCNProc);
+    TransposedData(const unsigned nrStations, const unsigned nrSamplesToCNProc, const unsigned nrPencilBeams);
 
-    virtual size_t requiredSize() const;
     virtual void allocate( Allocator &allocator = heapAllocator );
 
     Vector<SubbandMetaData> metaData; //[itsNrStations]
   private:
     const unsigned		itsNrStations;
+    const unsigned		itsNrPencilBeams;
     const unsigned		itsNrSamplesToCNProc;
 };
 
 
-template <typename SAMPLE_TYPE> inline TransposedData<SAMPLE_TYPE>::TransposedData(const unsigned nrStations, const unsigned nrSamplesToCNProc)
+template <typename SAMPLE_TYPE> inline TransposedData<SAMPLE_TYPE>::TransposedData(const unsigned nrStations, const unsigned nrSamplesToCNProc, const unsigned nrPencilBeams)
 :
   SuperType(false,boost::extents[nrStations][nrSamplesToCNProc][NR_POLARIZATIONS],0),
   itsNrStations(nrStations),
+  itsNrPencilBeams(nrPencilBeams),
   itsNrSamplesToCNProc(nrSamplesToCNProc)
 {
 }
@@ -42,13 +44,11 @@ template <typename SAMPLE_TYPE> inline TransposedData<SAMPLE_TYPE>::TransposedDa
 template <typename SAMPLE_TYPE> inline void TransposedData<SAMPLE_TYPE>::allocate( Allocator &allocator )
 {
   SuperType::allocate( allocator );
-  metaData.resize(itsNrStations, 32, allocator);
-}
+  metaData.resize(itsNrStations, 32);
 
-template <typename SAMPLE_TYPE> inline size_t TransposedData<SAMPLE_TYPE>::requiredSize() const
-{
-  return SuperType::requiredSize() +
-	 align(sizeof(SubbandMetaData) * itsNrStations, 32);
+  for( unsigned i = 0; i < itsNrStations; i++ ) {
+    metaData[i] = SubbandMetaData( itsNrPencilBeams );
+  }
 }
 
 } // namespace RTCP
diff --git a/RTCP/IONProc/src/InputSection.cc b/RTCP/IONProc/src/InputSection.cc
index 2273064117736f4d794cecd4e2f80bf1343fc2ba..d6df2142d5526142d124bda2eca131b90b64369c 100644
--- a/RTCP/IONProc/src/InputSection.cc
+++ b/RTCP/IONProc/src/InputSection.cc
@@ -134,6 +134,7 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::preprocess(const
   itsNSubbandsPerPset	= ps->nrSubbandsPerPset();
   itsNSamplesPerSec	= ps->nrSubbandSamples();
   itsNrBeams		= ps->nrBeams();
+  itsNrPencilBeams	= ps->nrPencilBeams();
   itsNrPsets		= ps->nrPsets();
 
 #if defined DUMP_RAW_DATA
@@ -166,10 +167,10 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::preprocess(const
   itsSyncedStamp = TimeStamp(seconds, samples);
  
   if (itsNeedDelays) {
-    itsDelaysAtBegin.resize(itsNrBeams);
-    itsDelaysAfterEnd.resize(itsNrBeams);
-    itsBeamDirectionsAtBegin.resize(itsNrBeams);
-    itsBeamDirectionsAfterEnd.resize(itsNrBeams);
+    itsDelaysAtBegin.resize(itsNrBeams,itsNrPencilBeams);
+    itsDelaysAfterEnd.resize(itsNrBeams,itsNrPencilBeams);
+    itsBeamDirectionsAtBegin.resize(itsNrBeams,itsNrPencilBeams);
+    itsBeamDirectionsAfterEnd.resize(itsNrBeams,itsNrPencilBeams);
     
     itsDelayComp = new WH_DelayCompensation(ps, inputs[0].station, itsSyncedStamp); // TODO: support more than one station
 
@@ -186,8 +187,8 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::preprocess(const
   itsBBuffers.resize(itsNrInputs);
   itsDelayedStamps.resize(itsNrBeams);
   itsSamplesDelay.resize(itsNrBeams);
-  itsFineDelaysAtBegin.resize(itsNrBeams);
-  itsFineDelaysAfterEnd.resize(itsNrBeams);
+  itsFineDelaysAtBegin.resize(itsNrBeams,itsNrPencilBeams);
+  itsFineDelaysAfterEnd.resize(itsNrBeams,itsNrPencilBeams);
   itsFlags.resize(boost::extents[itsNrInputs][itsNrBeams]);
 
   for (unsigned rsp = 0; rsp < itsNrInputs; rsp ++)
@@ -236,24 +237,33 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::computeDelays()
     for (unsigned beam = 0; beam < itsNrBeams; beam ++) {
       // The coarse delay is determined for the center of the current
       // time interval and is expressed in an entire amount of samples.
-      const signed int coarseDelay = static_cast<signed int>(floor(0.5 * (itsDelaysAtBegin[beam] + itsDelaysAfterEnd[beam]) * itsSampleRate + 0.5));
-    
+      //
+      // We use the central pencil beam (#0) for the coarse delay compensation.
+      const signed int coarseDelay = static_cast<signed int>(floor(0.5 * (itsDelaysAtBegin[beam][0] + itsDelaysAfterEnd[beam][0]) * itsSampleRate + 0.5));
+
       // The fine delay is determined for the boundaries of the current
       // time interval and is expressed in seconds.
       const double d = coarseDelay * itsSampleDuration;
-    
+
       itsDelayedStamps[beam]      -= coarseDelay;
       itsSamplesDelay[beam]       = -coarseDelay;
-      itsFineDelaysAtBegin[beam]  = static_cast<float>(itsDelaysAtBegin[beam] - d);
-      itsFineDelaysAfterEnd[beam] = static_cast<float>(itsDelaysAfterEnd[beam] - d);
+
+      for (unsigned pencil = 0; pencil < itsNrPencilBeams; pencil++ ) {
+        // we don't do coarse delay compensation for the individual pencil beams to avoid complexity and overhead
+        itsFineDelaysAtBegin[beam][pencil]  = static_cast<float>(itsDelaysAtBegin[beam][pencil] - d);
+        itsFineDelaysAfterEnd[beam][pencil] = static_cast<float>(itsDelaysAfterEnd[beam][pencil] - d);
+      }
     }
 
     itsDelayTimer.stop();
   } else {
     for (unsigned beam = 0; beam < itsNrBeams; beam ++) {
       itsSamplesDelay[beam]       = 0;
-      itsFineDelaysAtBegin[beam]  = 0;
-      itsFineDelaysAfterEnd[beam] = 0;
+
+      for (unsigned pencil = 0; pencil < itsNrPencilBeams; pencil++ ) {
+        itsFineDelaysAtBegin[beam][pencil]  = 0;
+        itsFineDelaysAfterEnd[beam][pencil] = 0;
+      }
     }  
   }
 }
@@ -291,7 +301,7 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::writeLogMessage()
 
   if (itsDelayCompensation) {
     for (unsigned beam = 0; beam < itsNrBeams; beam ++)
-      logStr << (beam == 0 ? ", delays: [" : ", ") << PrettyTime(itsDelaysAtBegin[beam], 7);
+      logStr << (beam == 0 ? ", delays: [" : ", ") << PrettyTime(itsDelaysAtBegin[beam][0], 7);
       //logStr << (beam == 0 ? ", delays: [" : ", ") << PrettyTime(itsDelaysAtBegin[beam], 7) << " = " << itsSamplesDelay[beam] << " samples + " << PrettyTime(itsFineDelaysAtBegin[beam], 7);
 
     logStr << "]";
@@ -319,32 +329,43 @@ template<typename SAMPLE_TYPE> void InputSection<SAMPLE_TYPE>::toComputeNodes()
     command.write(stream);
 
     // create and send all metadata in one "large" message
-    std::vector<SubbandMetaData, AlignedStdAllocator<SubbandMetaData, 16> > metaData(itsNrPsets);
-
+    Vector<SubbandMetaData> metaData( itsNrPsets, 16, heapAllocator );
+    for( unsigned i = 0; i < itsNrPsets; i++ ) {
+      metaData[i] = SubbandMetaData( itsNrPencilBeams );
+    }
+    
     if( itsNeedDelays ) {
       for (unsigned pset = 0; pset < itsNrPsets; pset ++) {
         unsigned subband  = itsNSubbandsPerPset * pset + subbandBase;
+
 	if(subband < itsNSubbands) {
 	  unsigned rspBoard = itsSubbandToRSPboardMapping[subband];
 	  unsigned beam     = itsSubbandToBeamMapping[subband];
-	  const vector<double> &beamDirBegin = itsBeamDirectionsAtBegin[beam].coord().get();
-	  const vector<double> &beamDirEnd   = itsBeamDirectionsAfterEnd[beam].coord().get();
 
-	  metaData[pset].delayAtBegin   = itsFineDelaysAtBegin[beam];
-	  metaData[pset].delayAfterEnd  = itsFineDelaysAfterEnd[beam];
+          for( unsigned p = 0; p < itsNrPencilBeams; p++ ) {
+            struct SubbandMetaData::beamInfo &beamInfo = metaData[pset].beams[p];
 
-	  for( unsigned i = 0; i < 3; i++ ) {
-	    metaData[pset].beamDirectionAtBegin[i]  = beamDirBegin[i];
-	    metaData[pset].beamDirectionAfterEnd[i] = beamDirEnd[i];
-	  }
+	    beamInfo.delayAtBegin   = itsFineDelaysAtBegin[beam][p];
+	    beamInfo.delayAfterEnd  = itsFineDelaysAfterEnd[beam][p];
 
-	  metaData[pset].alignmentShift = itsBBuffers[rspBoard]->alignmentShift(beam);
+	    const vector<double> &beamDirBegin = itsBeamDirectionsAtBegin[beam][p].coord().get();
+	    const vector<double> &beamDirEnd   = itsBeamDirectionsAfterEnd[beam][p].coord().get();
+
+	    for( unsigned i = 0; i < 3; i++ ) {
+	      beamInfo.beamDirectionAtBegin[i]  = beamDirBegin[i];
+	      beamInfo.beamDirectionAfterEnd[i] = beamDirEnd[i];
+	    }
+	  }  
+
+	  metaData[pset].alignmentShift() = itsBBuffers[rspBoard]->alignmentShift(beam);
 	  metaData[pset].setFlags(itsFlags[rspBoard][beam]);
 	}
       }
     }
 
-    stream->write(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+    for( unsigned pset = 0; pset < itsNrPsets; pset++ ) {
+      metaData[pset].write( stream );
+    }
 
     // now send all subband data
     for (unsigned pset = 0; pset < itsNrPsets; pset ++) {
diff --git a/RTCP/IONProc/src/InputSection.h b/RTCP/IONProc/src/InputSection.h
index f13acdc1003ffb8d962e5a3a24614bcbdb893b3e..f6e6ab1f45ddd42b6d13745544ef285b3e22c67b 100644
--- a/RTCP/IONProc/src/InputSection.h
+++ b/RTCP/IONProc/src/InputSection.h
@@ -30,6 +30,7 @@
 
 //# Includes
 #include <Interface/Parset.h>
+#include <Interface/MultiDimArray.h>
 #include <Interface/RSPTimeStamp.h>
 #include <Stream/Stream.h>
 #include <BeamletBuffer.h>
@@ -85,10 +86,10 @@ template <typename SAMPLE_TYPE> class InputSection {
     
     TimeStamp			 itsSyncedStamp;
    
-    std::vector<double>		 itsDelaysAtBegin;
-    std::vector<double>		 itsDelaysAfterEnd;
-    std::vector<AMC::Direction>	 itsBeamDirectionsAtBegin;
-    std::vector<AMC::Direction>	 itsBeamDirectionsAfterEnd;
+    Matrix<double>		 itsDelaysAtBegin;
+    Matrix<double>		 itsDelaysAfterEnd;
+    Matrix<AMC::Direction>	 itsBeamDirectionsAtBegin;
+    Matrix<AMC::Direction>	 itsBeamDirectionsAfterEnd;
     unsigned			 itsNrPsets;
     
     unsigned			 itsMaxNetworkDelay; // in samples
@@ -98,6 +99,7 @@ template <typename SAMPLE_TYPE> class InputSection {
     unsigned			 itsNHistorySamples;
     unsigned			 itsNrInputs;
     unsigned			 itsNrBeams;
+    unsigned			 itsNrPencilBeams;
 
     unsigned			 itsCurrentComputeCore, itsNrCoresPerPset;
     unsigned			 itsPsetNumber;
@@ -108,9 +110,10 @@ template <typename SAMPLE_TYPE> class InputSection {
 
     std::vector<TimeStamp>	 itsDelayedStamps;
     std::vector<signed int>	 itsSamplesDelay;
-    std::vector<float>		 itsFineDelaysAtBegin, itsFineDelaysAfterEnd;
     boost::multi_array<SparseSet<unsigned>, 2> itsFlags;
 
+    Matrix<float>		 itsFineDelaysAtBegin, itsFineDelaysAfterEnd;
+
     
     LogThread			 *itsLogThread;
     NSTimer			 itsDelayTimer;
diff --git a/RTCP/IONProc/src/WH_DelayCompensation.cc b/RTCP/IONProc/src/WH_DelayCompensation.cc
index 99f52c1cd8720fe2839f7a9d6a3f9e9370def460..0f37380cf4de67fea007210f207fe85e4bdb60cc 100644
--- a/RTCP/IONProc/src/WH_DelayCompensation.cc
+++ b/RTCP/IONProc/src/WH_DelayCompensation.cc
@@ -36,6 +36,7 @@
 #include <Common/LofarLogger.h>
 #include <Common/PrettyUnits.h>
 #include <Interface/Exceptions.h>
+#include <Interface/PencilCoordinates.h>
 
 namespace LOFAR 
 {
@@ -49,15 +50,16 @@ namespace LOFAR
                                                const string &stationName,
 					       const TimeStamp &startTime) :
       stop         (false),					       
-      itsBuffer    (bufferSize,ps->nrBeams()),
+      itsBuffer    (bufferSize,ps->nrBeams() * ps->nrPencilBeams()),
       head         (0),
       tail         (0),
       bufferFree   (bufferSize),
       bufferUsed   (0),
 
       itsNrCalcDelays    (ps->nrCalcDelays()),
-      itsNrBeams   (ps->nrBeams()),
-      itsStartTime (startTime),
+      itsNrBeams         (ps->nrBeams()),
+      itsNrPencilBeams   (ps->nrPencilBeams()),
+      itsStartTime       (startTime),
       itsNrSamplesPerSec (ps->nrSubbandSamples()),
       itsSampleDuration  (ps->sampleDuration()),
       itsStationName     (stationName),
@@ -150,14 +152,14 @@ namespace LOFAR
         ResultData result;
         converter->j2000ToItrf(result, request); // expensive
 
-        ASSERTSTR(result.direction.size() == itsNrCalcDelays * itsNrBeams,
-	  	  result.direction.size() << " == " << itsNrCalcDelays * itsNrBeams);
+        ASSERTSTR(result.direction.size() == itsNrCalcDelays * itsNrBeams * itsNrPencilBeams,
+	  	  result.direction.size() << " == " << itsNrCalcDelays * itsNrBeams * itsNrPencilBeams );
 
 	// append the results to the circular buffer
 	unsigned resultIndex = 0;
 
         for( unsigned t = 0; t < itsNrCalcDelays; t++ ) {
-	  for( unsigned b = 0; b < itsNrBeams; b++ ) {
+	  for( unsigned b = 0; b < itsNrBeams * itsNrPencilBeams; b++ ) {
 	    ASSERTSTR( tail < bufferSize, tail << " < " << bufferSize );
 
 	    itsBuffer[tail][b] = result.direction[resultIndex++];
@@ -181,23 +183,25 @@ namespace LOFAR
       delete converter;
     }
 
-    void WH_DelayCompensation::getNextDelays( vector<Direction> &directions, vector<double> &delays )
+    void WH_DelayCompensation::getNextDelays( Matrix<Direction> &directions, Matrix<double> &delays )
     {
-      ASSERTSTR(directions.size() == itsNrBeams,
-                directions.size() << " == " << itsNrBeams);
+      ASSERTSTR(directions.num_elements() == itsNrBeams * itsNrPencilBeams,
+                directions.num_elements() << " == " << itsNrBeams << "*" << itsNrPencilBeams);
 
-      ASSERTSTR(delays.size() == itsNrBeams,
-                delays.size() << " == " << itsNrBeams);
+      ASSERTSTR(delays.num_elements() == itsNrBeams * itsNrPencilBeams,
+                delays.num_elements() << " == " << itsNrBeams << "*" << itsNrPencilBeams);
 
       bufferUsed.down();
 
       // copy the directions at itsBuffer[head] into the provided buffer,
       // and calculate the respective delays
       for( unsigned b = 0; b < itsNrBeams; b++ ) {
-        const Direction &dir = itsBuffer[head][b];
+        for( unsigned p = 0; p < itsNrPencilBeams; p++ ) {
+          const Direction &dir = itsBuffer[head][b*itsNrPencilBeams + p];
 
-        directions[b] = dir;
-	delays[b] = dir * itsPhasePositionDiffs * (1.0 / speedOfLight);
+          directions[b][p] = dir;
+	  delays[b][p] = dir * itsPhasePositionDiffs * (1.0 / speedOfLight);
+	}  
       }
 
       // increment the head pointer
@@ -212,11 +216,18 @@ namespace LOFAR
 
     void WH_DelayCompensation::setBeamDirections(const Parset *ps)
     {
+      const PencilCoordinates& pencilBeams = ps->pencilBeams();
+
       // What coordinate system is used for these source directions?
       // Currently, we support J2000, ITRF, and AZEL.
       Direction::Types dirType(Direction::INVALID);
+
+      // TODO: For now, we include pencil beams for all regular beams,
+      // and use the pencil beam offsets as offsets in J2000.
+      // To do the coordinates properly, the offsets should be applied
+      // in today's coordinates (JMEAN/JTRUE?), not J2000.
       
-      itsBeamDirections.resize(itsNrBeams);
+      itsBeamDirections.resize(itsNrBeams*itsNrPencilBeams);
       
       // Get the source directions from the parameter set. 
       // Split the \a dir vector into separate Direction objects.
@@ -230,7 +241,17 @@ namespace LOFAR
         else THROW(IONProcException, "Observation.BeamDirectionType must be one of "
                        "J2000, ITRF, or AZEL");
 
-        itsBeamDirections[beam] = Direction(beamDir[0], beamDir[1], dirType);
+        for (unsigned pencil = 0; pencil < itsNrPencilBeams; pencil ++) {
+	  // obtain pencil coordinate
+	  const PencilCoord3D &pencilCoord = pencilBeams[pencil];
+
+	  // apply angle modification (TODO: different calculates for J2000 (ra/dec) and AZEL and ITRF!)
+	  const double angle1 = beamDir[0] + pencilCoord[0];
+	  const double angle2 = beamDir[1] + pencilCoord[1];
+
+	  // store beam
+          itsBeamDirections[beam*itsNrBeams + pencil] = Direction(angle1, angle2, dirType);
+	}
       }
     }
 
diff --git a/RTCP/IONProc/src/WH_DelayCompensation.h b/RTCP/IONProc/src/WH_DelayCompensation.h
index cfa11fb25e7fbcf712cd1585a84323281eadb66c..6427e7606e6962e037ae67fe99e915bc766c2c30 100644
--- a/RTCP/IONProc/src/WH_DelayCompensation.h
+++ b/RTCP/IONProc/src/WH_DelayCompensation.h
@@ -95,7 +95,8 @@ namespace LOFAR
       ~WH_DelayCompensation();
 
       // get the set of directions and delays for the beams, for the next CN integration time
-      void getNextDelays( vector<AMC::Direction> &directions, vector<double> &delays );
+      // Both matrices must have dimensions [itsNrBeams][itsNrPencilBeams]
+      void getNextDelays( Matrix<AMC::Direction> &directions, Matrix<double> &delays );
       
     private:
       // do the delay compensation calculations in a separate thread to allow bulk
@@ -137,6 +138,7 @@ namespace LOFAR
 
       // Beam info.
       const unsigned                itsNrBeams;
+      const unsigned                itsNrPencilBeams;
       vector<AMC::Direction>        itsBeamDirections;
 
       // Sample timings.
diff --git a/RTCP/Interface/include/Interface/CN_Configuration.h b/RTCP/Interface/include/Interface/CN_Configuration.h
index 5d1337ac777ea41fe66b93119c1a0ebf4b228448..646fff031d3bf17005399d5a45b04cecde132719 100644
--- a/RTCP/Interface/include/Interface/CN_Configuration.h
+++ b/RTCP/Interface/include/Interface/CN_Configuration.h
@@ -26,6 +26,7 @@
 #include <Stream/Stream.h>
 #include <Interface/MultiDimArray.h>
 #include <Interface/CN_Mode.h>
+#include <Interface/PencilCoordinates.h>
 #include <Interface/Parset.h>
 
 #include <vector>
@@ -57,17 +58,13 @@ class CN_Configuration
     double		  &sampleRate();
     std::vector<unsigned> &inputPsets(), &outputPsets(), &tabList();
     std::vector<double>	  &refFreqs();
-    unsigned              &nrPencilRings();
-    double                &pencilRingSize();
-    unsigned              &nrManualPencilBeams();
-    Matrix<double>        &manualPencilBeams();
+    PencilCoordinates     &pencilBeams();
     std::vector<double>   &refPhaseCentre();
     Matrix<double>        &phaseCentres();
     CN_Mode               &mode();
     bool                  &outputIncoherentStokesI();
     bool                  &stokesIntegrateChannels();
-
-    unsigned              nrPencilBeams() { return 3 * nrPencilRings() * (nrPencilRings() + 1) + 1 + nrManualPencilBeams(); }
+    unsigned              nrPencilBeams() const;
     
     void		  read(Stream *);
     void		  write(Stream *);
@@ -81,7 +78,7 @@ class CN_Configuration
     std::vector<unsigned> itsInputPsets, itsOutputPsets, itsTabList;
     std::vector<double>	  itsRefFreqs;
     std::vector<double>	  itsRefPhaseCentre;
-    Matrix<double>        itsManualPencilBeams;
+    PencilCoordinates     itsPencilBeams;
     Matrix<double>        itsPhaseCentres;
     CN_Mode               itsMode;
 
@@ -103,12 +100,10 @@ class CN_Configuration
       unsigned		  itsRefFreqsSize;
       unsigned		  itsInputPsets[MAX_PSETS], itsOutputPsets[MAX_PSETS], itsTabList[MAX_PSETS];
       double		  itsRefFreqs[MAX_SUBBANDS];
-      unsigned            itsNrPencilRings;
-      double              itsPencilRingSize;
       double              itsRefPhaseCentre[3];
       double              itsPhaseCentres[MAX_STATIONS * 3];
-      unsigned            itsNrManualPencilBeams;
-      double              itsManualPencilBeams[MAX_PENCILBEAMS * 2];
+      unsigned            itsNrPencilBeams;
+      double              itsPencilBeams[MAX_PENCILBEAMS * 2];
       bool                itsOutputIncoherentStokesI;
       bool                itsStokesIntegrateChannels;
     } itsMarshalledData;
@@ -195,24 +190,14 @@ inline std::vector<double> & CN_Configuration::refFreqs()
   return itsRefFreqs;
 }
 
-inline unsigned &CN_Configuration::nrPencilRings()
-{
-  return itsMarshalledData.itsNrPencilRings;
-}
-
-inline double &CN_Configuration::pencilRingSize()
-{
-  return itsMarshalledData.itsPencilRingSize;
-}
-
-inline unsigned &CN_Configuration::nrManualPencilBeams()
+inline PencilCoordinates &CN_Configuration::pencilBeams()
 {
-  return itsMarshalledData.itsNrManualPencilBeams;
+  return itsPencilBeams;
 }
 
-inline Matrix<double> &CN_Configuration::manualPencilBeams()
+inline unsigned CN_Configuration::nrPencilBeams() const
 {
-  return itsManualPencilBeams;
+  return itsPencilBeams.size();
 }
 
 inline std::vector<double> &CN_Configuration::refPhaseCentre()
diff --git a/RTCP/Interface/include/Interface/FilteredData.h b/RTCP/Interface/include/Interface/FilteredData.h
index 0975a50230a83fc49ad1e38094c50c0728ead2ba..b33e983070d012177e094b4265b29206d0b1e7f1 100644
--- a/RTCP/Interface/include/Interface/FilteredData.h
+++ b/RTCP/Interface/include/Interface/FilteredData.h
@@ -18,15 +18,15 @@ class FilteredData: public SampleData<fcomplex,4>
   public:
     typedef SampleData<fcomplex,4> SuperType;
 
-    FilteredData(const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration);
+    FilteredData(const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const unsigned nrStations);
 
-    virtual size_t requiredSize() const;
     virtual void allocate( Allocator &allocator = heapAllocator );
 
     Vector<SubbandMetaData>     metaData; //[itsNrStations]
 
   protected:
     const unsigned              itsNrStations;
+    const unsigned              itsNrPencilBeams;
     const unsigned              itsNrChannels;
     const unsigned              itsNrSamplesPerIntegration;
     virtual void readData( Stream* );
@@ -35,41 +35,45 @@ class FilteredData: public SampleData<fcomplex,4>
 
 
 
-inline FilteredData::FilteredData(const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration)
+inline FilteredData::FilteredData(const unsigned nrStations, const unsigned nrChannels, const unsigned nrSamplesPerIntegration, const unsigned nrPencilBeams)
 :
   // The "| 2" significantly improves transpose speeds for particular
   // numbers of stations due to cache conflict effects.  The extra memory
   // is not used.
   SuperType::SampleData(false,boost::extents[nrChannels][nrStations][nrSamplesPerIntegration | 2][NR_POLARIZATIONS], nrStations),
+  metaData(nrStations,32),
   itsNrStations(nrStations),
+  itsNrPencilBeams(nrPencilBeams),
   itsNrChannels(nrChannels),
   itsNrSamplesPerIntegration(nrSamplesPerIntegration)
 {
 }
 
-inline size_t FilteredData::requiredSize() const
-{
-  return SuperType::requiredSize() +
-         align(sizeof(SubbandMetaData) * itsNrStations, 32);
-}
-
 inline void FilteredData::allocate( Allocator &allocator )
 {
   SuperType::allocate( allocator );
-  metaData.resize( itsNrStations, 32, allocator );
+  metaData.resize( itsNrStations, 32 );
+  
+  for( unsigned i = 0; i < itsNrStations; i++ ) {
+    metaData[i] = SubbandMetaData( itsNrPencilBeams );
+  }
 }
 
 
 inline void FilteredData::readData(Stream *str)
 {
-  str->read(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+  for( unsigned subband = 0; subband < itsNrStations; subband++ ) {
+    metaData[subband].read( str );
+  }
   SuperType::readData(str);
 }
 
 
 inline void FilteredData::writeData(Stream *str)
 {
-  str->write(&metaData[0], metaData.size() * sizeof(SubbandMetaData));
+  for( unsigned subband = 0; subband < itsNrStations; subband++ ) {
+    metaData[subband].write( str );
+  }
   SuperType::writeData(str);
 }
 
diff --git a/RTCP/Interface/include/Interface/Makefile.am b/RTCP/Interface/include/Interface/Makefile.am
index 7ef160e42f71830e7c074a91ff73fa23abf16096..b1b02a8d46831179bf4fe8e487481132e64dd38c 100644
--- a/RTCP/Interface/include/Interface/Makefile.am
+++ b/RTCP/Interface/include/Interface/Makefile.am
@@ -21,6 +21,7 @@ pkginclude_HEADERS = Package__Version.h \
 		     PencilBeamData.h \
 		     PencilCoordinates.h \
                      StokesData.h \
-                     PipelineOutput.h
+                     PipelineOutput.h \
+                     PencilCoordinates.h
 
 include $(top_srcdir)/Makefile.common
diff --git a/RTCP/Interface/include/Interface/Parset.h b/RTCP/Interface/include/Interface/Parset.h
index d3f4fdd5ef206e69dcdc04ae72d97bf0c126e150..26cbf6c622ef93a196f8e41dc92bf2b5a726277d 100644
--- a/RTCP/Interface/include/Interface/Parset.h
+++ b/RTCP/Interface/include/Interface/Parset.h
@@ -36,6 +36,7 @@
 #include <Common/lofar_datetime.h>
 #include <Common/LofarLogger.h> 
 #include <Interface/Config.h>
+#include <Interface/PencilCoordinates.h>
 #include <ApplCommon/Observation.h>
 #include <Stream/Stream.h>
 
@@ -110,12 +111,8 @@ public:
 	bool           outputIncoherentStokesI() const;
 	bool           stokesIntegrateChannels() const;
 
-	uint32         nrManualPencilBeams() const;
-	vector<double> getManualPencilBeam( const unsigned pencil ) const;
-	uint32	       nrPencilRings() const;
-	double	       pencilRingSize() const;
-
 	uint32	       nrPencilBeams() const;
+	PencilCoordinates pencilBeams() const;
 
 	unsigned         nrSubbands() const;
 	unsigned         nrBeams() const;
@@ -150,6 +147,11 @@ public:
 	vector<double> itsStPositions;
 	
 private:
+	uint32         nrManualPencilBeams() const;
+	vector<double> getManualPencilBeam( const unsigned pencil ) const;
+	uint32	       nrPencilRings() const;
+	double	       pencilRingSize() const;
+
 	void           addPosition(string stName);
 	double	       getTime(const char *name) const;
 	static int     findIndex(uint32 pset, const vector<uint32> &psets);
@@ -435,6 +437,17 @@ inline uint32 Parset::nrPencilBeams() const
   return 3 * nrPencilRings() * (nrPencilRings() + 1) + 1 + nrManualPencilBeams();
 }
 
+inline PencilCoordinates Parset::pencilBeams() const
+{
+  // include both the pencil rings and the manually defined pencil beam coordinates
+  PencilRings coordinates( nrPencilRings(), pencilRingSize() );
+  for( unsigned i = 0; i < nrManualPencilBeams(); i++ ) {
+    coordinates += PencilCoord3D( getManualPencilBeam( i ) );
+  }
+
+  return coordinates;
+}
+
 inline double Parset::pencilRingSize() const
 {
   return getDouble("Observation.pencilRingSize");
diff --git a/RTCP/Interface/include/Interface/PencilCoordinates.h b/RTCP/Interface/include/Interface/PencilCoordinates.h
index 5e7c9b7f3c58607f5cd3a0b59ec53b3d0e8e941b..984267478b418e531827a9776f7a5ed4601a21b3 100644
--- a/RTCP/Interface/include/Interface/PencilCoordinates.h
+++ b/RTCP/Interface/include/Interface/PencilCoordinates.h
@@ -89,7 +89,12 @@ class PencilCoord3D {
       return *this;
     }
 
-    inline double& operator[]( const unsigned i )
+    inline const double operator[]( const unsigned i ) const
+    {
+      return itsXYZ[i];
+    }
+
+    inline double &operator[]( const unsigned i )
     {
       return itsXYZ[i];
     }
@@ -117,12 +122,15 @@ class PencilCoordinates
     PencilCoordinates( const std::vector<PencilCoord3D> &coordinates ): itsCoordinates(coordinates) {}
     PencilCoordinates( const Matrix<double> &coordinates );
 
-    std::vector<PencilCoord3D>& getCoordinates()
+    inline std::vector<PencilCoord3D>& getCoordinates() 
     { return itsCoordinates; }
 
-    size_t size() const
+    inline size_t size() const
     { return itsCoordinates.size(); }
 
+    inline const PencilCoord3D& operator[]( const unsigned nr ) const
+    { return itsCoordinates[nr]; }
+
     void read( Stream *s );
     void write( Stream *s ) const;
 
diff --git a/RTCP/Interface/include/Interface/PipelineOutput.h b/RTCP/Interface/include/Interface/PipelineOutput.h
index 240725b21e93d489a22ab6216f16888b218f0c0a..eb8939c6bcdd1b7df0818a57aaf0125a1ad8669e 100644
--- a/RTCP/Interface/include/Interface/PipelineOutput.h
+++ b/RTCP/Interface/include/Interface/PipelineOutput.h
@@ -128,7 +128,7 @@ inline PipelineOutputSet::PipelineOutputSet( const Parset &ps, Allocator &alloca
   switch( mode.mode() ) {
     case CN_Mode::FILTER:
 	o = new PipelineOutput( id++, PipelineOutput::FILTEREDDATA );
-        o->itsData = new FilteredData( ps.nrStations(), ps.nrChannelsPerSubband(), ps.CNintegrationSteps() );
+        o->itsData = new FilteredData( ps.nrStations(), ps.nrChannelsPerSubband(), ps.CNintegrationSteps(), ps.nrPencilBeams() );
         break;
 
     case CN_Mode::CORRELATE:
diff --git a/RTCP/Interface/include/Interface/SubbandMetaData.h b/RTCP/Interface/include/Interface/SubbandMetaData.h
index ab7ff4290e52e30141df5e4fd6b057474bc44198..1e9115f111ebfc5e3a6c63e5a7adbff7a81fd2d0 100644
--- a/RTCP/Interface/include/Interface/SubbandMetaData.h
+++ b/RTCP/Interface/include/Interface/SubbandMetaData.h
@@ -24,7 +24,9 @@
 #define LOFAR_INTERFACE_SUBBAND_META_DATA_H
 
 #include <Interface/SparseSet.h>
+#include <Interface/MultiDimArray.h>
 #include <Stream/Stream.h>
+#include <Common/LofarLogger.h>
 
 #include <cassert>
 
@@ -32,38 +34,101 @@
 namespace LOFAR {
 namespace RTCP {
 
+// Note: struct must remain copyable to avoid ugly constructions when passing it around
 struct SubbandMetaData
 {
   public:
+    SubbandMetaData();
+    SubbandMetaData( const unsigned nrBeams );
+
     SparseSet<unsigned>	getFlags() const;
     void		setFlags(const SparseSet<unsigned> &);
 
-    float		delayAtBegin, delayAfterEnd;
-    double              beamDirectionAtBegin[3], beamDirectionAfterEnd[3];
+    unsigned            alignmentShift() const;
+    unsigned            &alignmentShift();
+
+    void read( Stream *str );
+    void write( Stream *str ) const;
+
+    struct beamInfo {
+      float delayAtBegin, delayAfterEnd;
+      double beamDirectionAtBegin[3], beamDirectionAfterEnd[3];
+    };
 
-    unsigned		alignmentShift;
+    // Note: CNProc/AsyncTranspose reads the information below directly
+    std::vector<struct beamInfo> beams;
 
-  private:
-    unsigned char	flagsBuffer[132];
+    struct marshallData {
+      unsigned char	flagsBuffer[132];
+      unsigned		alignmentShift;
+    } itsMarshalledData;
 };
 
+inline SubbandMetaData::SubbandMetaData()
+{
+  // Used for constructing vectors, such as Vector<SubbandMetaData> metaData( itsNrStations, 32, allocator )
+  // Without a default constructor, construction of such an array is possible, but quite awkward as it
+  // requires the manual construction and destruction of each element:
+  //
+  // SubbandMetaData *metaData = allocator.allocate(itsNrStations*sizeof(SubbandMetaData),32)
+  // for( unsigned i = 0; i < itsNrStations; i++ ) {
+  //   metaData[i] = new SubbandMetaData( ... );
+  // }
+  // ...
+  // for( unsigned i = 0; i < itsNrStations; i++ ) {
+  //   delete metaData[i];
+  // }
+  // allocator.deallocate( metaData );
+  //
+  // NOTE: While C++ has a new[](...) construction especially for this use case, it is not part of ISO C++.
+}
+
+inline SubbandMetaData::SubbandMetaData( const unsigned nrBeams )
+: 
+  beams(nrBeams)
+{
+}
 
 inline SparseSet<unsigned> SubbandMetaData::getFlags() const
 {
   SparseSet<unsigned> flags;
 
-  flags.unmarshall(flagsBuffer);
+  flags.unmarshall(itsMarshalledData.flagsBuffer);
   return flags;
 }
 
-
 inline void SubbandMetaData::setFlags(const SparseSet<unsigned> &flags)
 {
-  ssize_t size = flags.marshall(&flagsBuffer, sizeof flagsBuffer);
+  ssize_t size = flags.marshall(&itsMarshalledData.flagsBuffer, sizeof itsMarshalledData.flagsBuffer);
   
   assert(size >= 0);
 }
 
+inline unsigned SubbandMetaData::alignmentShift() const
+{
+  return itsMarshalledData.alignmentShift;
+}
+
+inline unsigned &SubbandMetaData::alignmentShift()
+{
+  return itsMarshalledData.alignmentShift;
+}
+
+inline void SubbandMetaData::read( Stream *str )
+{
+  // TODO: endianness
+
+  str->read(&itsMarshalledData, sizeof itsMarshalledData);
+  str->read(&beams.front(), beams.size() * sizeof beams[0] );
+}
+
+inline void SubbandMetaData::write( Stream *str ) const
+{
+  // TODO: endianness
+
+  str->write(&itsMarshalledData, sizeof itsMarshalledData);
+  str->write(&beams.front(), beams.size() * sizeof beams[0] );
+}
 
 } // namespace RTCP
 } // namespace LOFAR
diff --git a/RTCP/Interface/src/CN_Configuration.cc b/RTCP/Interface/src/CN_Configuration.cc
index 502d31b8280bca50195ed17a9727170228f2e9ac..01e75ca82f7e607ee5869f1368f10b83c1d478cb 100644
--- a/RTCP/Interface/src/CN_Configuration.cc
+++ b/RTCP/Interface/src/CN_Configuration.cc
@@ -47,9 +47,7 @@ CN_Configuration::CN_Configuration( const Parset &parset )
   outputPsets()             = parset.getUint32Vector("OLAP.CNProc.outputPsets");
   tabList()                 = parset.getUint32Vector("OLAP.CNProc.tabList");
   refFreqs()                = parset.subbandToFrequencyMapping();
-  nrPencilRings()           = parset.nrPencilRings();
-  pencilRingSize()          = parset.pencilRingSize();
-  nrManualPencilBeams()     = parset.nrManualPencilBeams();
+  pencilBeams()             = parset.pencilBeams();
   refPhaseCentre()          = parset.getRefPhaseCentres();
   mode()                    = CN_Mode(parset);
   outputIncoherentStokesI() = parset.outputIncoherentStokesI();
@@ -68,15 +66,6 @@ CN_Configuration::CN_Configuration( const Parset &parset )
       itsPhaseCentres[stat][dim] = positions[stat*3+dim];
     }
   }
-
-  itsManualPencilBeams.resize( parset.nrManualPencilBeams(), 2 );
-  for( unsigned beam = 0; beam < parset.nrManualPencilBeams(); beam++ ) {
-    std::vector<double> coordinates = parset.getManualPencilBeam( beam );
-
-    for( unsigned dim = 0; dim < 2; dim++ ) {
-      itsManualPencilBeams[beam][dim] = coordinates[dim];
-    }
-  }
 }
 
 #endif
@@ -86,6 +75,7 @@ void CN_Configuration::read(Stream *str)
 {
   str->read(&itsMarshalledData, sizeof itsMarshalledData);
   itsMode.read(str);
+  itsPencilBeams.read(str);
 
   itsInputPsets.resize(itsMarshalledData.itsInputPsetsSize);
   memcpy(&itsInputPsets[0], itsMarshalledData.itsInputPsets, itsMarshalledData.itsInputPsetsSize * sizeof(unsigned));
@@ -106,11 +96,6 @@ void CN_Configuration::read(Stream *str)
   for( unsigned stat = 0; stat < nrStations(); stat++ ) {
     memcpy(&itsPhaseCentres[stat][0], &itsMarshalledData.itsPhaseCentres[stat*3], 3 * sizeof(double));
   }
-
-  itsManualPencilBeams.resize(nrManualPencilBeams(),2);
-  for( unsigned beam = 0; beam < nrManualPencilBeams(); beam++ ) {
-    memcpy(&itsManualPencilBeams[beam][0], &itsMarshalledData.itsManualPencilBeams[beam*2], 2 * sizeof(double));
-  }
 }
 
 
@@ -137,12 +122,9 @@ void CN_Configuration::write(Stream *str)
     memcpy(&itsMarshalledData.itsPhaseCentres[stat*3], &itsPhaseCentres[stat][0], 3 * sizeof(double));
   }
 
-  for( unsigned beam = 0; beam < nrManualPencilBeams(); beam++ ) {
-    memcpy(&itsMarshalledData.itsManualPencilBeams[beam*2], &itsManualPencilBeams[beam][0], 2 * sizeof(double));
-  }
-
   str->write(&itsMarshalledData, sizeof itsMarshalledData);
   itsMode.write(str);
+  itsPencilBeams.write(str);
 }
 
 } // namespace RTCP
diff --git a/RTCP/Interface/src/PencilCoordinates.cc b/RTCP/Interface/src/PencilCoordinates.cc
index 2ab0728098e5c55bf0e0259b45a9bb438036c417..e7fc46a09d1c258b03d1d4837aeaf2972bd66bb7 100644
--- a/RTCP/Interface/src/PencilCoordinates.cc
+++ b/RTCP/Interface/src/PencilCoordinates.cc
@@ -153,7 +153,7 @@ double PencilRings::pencilHeightDelta() const
 
 void PencilRings::computeBeamCoordinates()
 {
-  std::vector<PencilCoord3D> &coordinates = getCoordinates();
+  std::vector<PencilCoord3D> coordinates;
   double dl[6], dm[6];
 
   // stride for each side, starting from the top, clock-wise
@@ -203,7 +203,6 @@ void PencilRings::computeBeamCoordinates()
   dm[5] = pencilHeightDelta();
 
   // ring 0: the center pencil beam
-  coordinates.reserve(nrPencils());
   coordinates.push_back( PencilCoord3D( 0, 0 ) );
 
   // ring 1-n: create the pencil beams from the inner ring outwards
@@ -219,6 +218,8 @@ void PencilRings::computeBeamCoordinates()
       }
     }
   }
+
+  *this += coordinates;
 }
 
 }