From 562e9e00465ed2fd0dd9836e7c45b466683dbfe0 Mon Sep 17 00:00:00 2001
From: Martin Gels <gels@astron.nl>
Date: Tue, 24 Jun 2008 07:41:42 +0000
Subject: [PATCH] Bug 1005: made CS1_IONProc compatible for BGP

---
 Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc     |  60 --------
 Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h      |  39 -----
 .../CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc | 142 +++++-------------
 Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h |  10 +-
 .../src/CS1_BGL_Processing_main.cc            |  30 +++-
 .../CS1_BGLProc/src/CacheAlignedAllocator.h   |   2 +-
 Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc    |   5 +-
 Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h     |   2 +-
 Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S  |  66 +++++++-
 Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h  |   2 +-
 Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S        |   2 +-
 Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h        |   2 +-
 Appl/CEP/CS1/CS1_BGLProc/src/FIR.h            |   2 +-
 Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S        |  36 ++++-
 Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h        |   7 +-
 Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc  | 134 +++++++++++++++++
 Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h   |  67 +++++++++
 Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am      |   4 +-
 Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc           |   2 +-
 Appl/CEP/CS1/CS1_BGLProc/src/PPF.h            |   2 +-
 Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc     |  87 +++++++----
 Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h      |  21 ++-
 22 files changed, 451 insertions(+), 273 deletions(-)
 delete mode 100644 Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc
 delete mode 100644 Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h
 create mode 100644 Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc
 create mode 100644 Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h

diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc b/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc
deleted file mode 100644
index 529096b6194..00000000000
--- a/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <lofar_config.h>
-
-#include <Allocator.h>
-
-#include <malloc.h>
-
-
-namespace LOFAR {
-namespace CS1 {
-
-Heap::Heap(size_t heapSize, int alignment)
-{
-  size = heapSize;
-
-  if (posix_memalign(&start, alignment, heapSize) != 0) {
-    std::cerr << "could not allocate heap" << std::endl;
-    exit(1);
-  }
-}
-
-
-Heap::~Heap()
-{
-  free(start);
-}
-
-
-Overlay::Overlay(const Heap &heap)
-{
-
-  freeList.include(heap.start, (void *) ((char *) heap.start + heap.size));
-}
-
-
-void *Overlay::allocate(size_t size, int alignment)
-{
-  for (SparseSet<void *>::const_iterator it = freeList.getRanges().begin(); it != freeList.getRanges().end(); it ++) {
-    void *begin = (void *) (((size_t) it->begin + alignment - 1) & ~(alignment - 1));
-
-    if ((char *) it->end - (char *) begin >= (ptrdiff_t) size) {
-      freeList.exclude(begin, (void *) ((char *) begin + size));
-      sizes[begin] = size;
-      return begin;
-    }
-  }
-
-  std::cerr << "could not allocate memory from heap" << std::endl;
-  std::exit(1);
-}
-
-
-void Overlay::deallocate(void *ptr)
-{
-  std::map<void *, size_t>::iterator index = sizes.find(ptr);
-  freeList.include(ptr, (void *) ((char *) ptr + index->second));
-  sizes.erase(index);
-}
-
-} // namespace CS1
-} // namespace LOFAR
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h b/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h
deleted file mode 100644
index 720712ed22c..00000000000
--- a/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_ALLOCATOR_H
-#define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_ALLOCATOR_H
-
-#include <CS1_Interface/SparseSet.h>
-#include <map>
-
-namespace LOFAR {
-namespace CS1 {
-
-class Heap
-{
-  public:
-    Heap(size_t heapSize, int alignment);
-    ~Heap();
-
-  private:
-    friend class Overlay;
-    void	 *start;
-    size_t	 size;
-};
-
-
-class Overlay
-{
-  public:
-    Overlay(const Heap &);
-
-    void *allocate(size_t size, int alignment);
-    void deallocate(void *ptr);
-
-  private:
-    SparseSet<void *>	     freeList;
-    std::map<void *, size_t> sizes;
-};
-
-} // namespace CS1
-} // namespace LOFAR
-
-#endif
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc
index b48d087ae54..04c23602d59 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc
@@ -30,7 +30,6 @@
 #include <Transport/TH_MPI.h>
 #include <CS1_Interface/BGL_Configuration.h>
 #include <CS1_Interface/BGL_Mapping.h>
-#include <CS1_Interface/PrintVector.h>
 
 #include <cassert>
 #include <complex>
@@ -39,8 +38,12 @@
 #include <iostream>
 #include <map>
 
+#if defined HAVE_BGP
+#include <common/bgp_personality_inlines.h>
+#include <spi/kernel_interface.h>
+#endif
 
-#if defined HAVE_ZOID && defined HAVE_BGL
+#if defined HAVE_ZOID && (defined HAVE_BGL || defined HAVE_BGP)
 extern "C" {
 #include <lofar.h>
 }
@@ -48,7 +51,11 @@ extern "C" {
 #endif
 
 #if defined HAVE_MPI
+#if defined HAVE_BGP || defined HAVE_BGL
+#define LOG_CONDITION	(itsCurrentSubband == itsFirstSubband)
+#else
 #define LOG_CONDITION	(itsRankInPset == 0)
+#endif
 //#define LOG_CONDITION	(TH_MPI::getCurrentRank() == 0)
 #else
 #define LOG_CONDITION	1
@@ -73,14 +80,15 @@ static NSTimer computeTimer("computing", true);
 char **BGL_Processing::original_argv;
 
 
-BGL_Processing::BGL_Processing(TransportHolder *th)
+BGL_Processing::BGL_Processing(TransportHolder *th, const LocationInfo &locationInfo)
 :
   itsTransportHolder(th),
+  itsLocationInfo(locationInfo),
   itsInputData(0),
   itsTransposedData(0),
   itsFilteredData(0),
   itsCorrelatedData(0),
-#if defined HAVE_BGL
+#if defined HAVE_BGL || defined HAVE_BGP
   itsTranspose(0),
 #endif
   itsPPF(0),
@@ -88,11 +96,11 @@ BGL_Processing::BGL_Processing(TransportHolder *th)
 {
   memset(itsArenas, 0, sizeof itsArenas);
 
-#if defined HAVE_BGL
-  getPersonality();
-#endif
+// #if defined HAVE_BGL
+//   getPersonality();
+// #endif
 
-#if defined HAVE_ZOID && defined HAVE_BGL
+#if defined HAVE_ZOID && (defined HAVE_BGL || defined HAVE_BGP)
   initIONode();
 #endif
 }
@@ -103,7 +111,8 @@ BGL_Processing::~BGL_Processing()
 }
 
 
-#if defined HAVE_BGL
+#if 0
+  //#if defined HAVE_BGL
 
 struct Location {
   unsigned pset, rankInPset;
@@ -142,21 +151,21 @@ void BGL_Processing::getPersonality()
     for (unsigned rank = 0; rank < allLocations.size(); rank ++)
       cores[allLocations[rank].pset][allLocations[rank].rankInPset] = rank;
 
-    for (unsigned pset = 0; pset < itsPersonality.numPsets(); pset ++)
-      std::clog << "pset " << pset << " contains cores " << cores[pset] << std::endl;
+//     for (unsigned pset = 0; pset < itsPersonality.numPsets(); pset ++)
+//       std::clog << "pset " << pset << " contains cores " << cores[pset] << std::endl;
   }
 }
 
 #endif
 
 
-#if defined HAVE_ZOID && defined HAVE_BGL
+#if defined HAVE_ZOID && (defined HAVE_BGL || defined HAVE_BGP)
 
 void BGL_Processing::initIONode() const
 {
   // one of the compute cores in each Pset has to initialize its I/O node
 
-  if (itsRankInPset == 0) {
+  if (itsLocationInfo.rankInPset() == 0) {
     std::vector<size_t> lengths;
 
     for (int arg = 0; original_argv[arg] != 0; arg ++) {
@@ -226,96 +235,18 @@ void BGL_Processing::printSubbandList() const
 #endif
 
 
-
-#if 0
-void BGL_Processing::preprocess(CS1_Parset *parset)
-{
-  checkConsistency(parset);
-  
-#if defined HAVE_BGL
-  unsigned usedCoresPerPset = parset->nrCoresPerPset();
-  unsigned myPset	    = itsPersonality.getPsetNum();
-  unsigned myCore	    = BGL_Mapping::reverseMapCoreOnPset(itsRankInPset, myPset);
-#else
-  unsigned usedCoresPerPset = 1;
-  unsigned myPset	    = 0;
-  unsigned myCore	    = 0;
-#endif
-
-  vector<unsigned> inputPsets  = parset->getUint32Vector("OLAP.BGLProc.inputPsets");
-  vector<unsigned> outputPsets = parset->getUint32Vector("OLAP.BGLProc.outputPsets");
-
-#if defined HAVE_BGL
-  Transpose::getMPIgroups(usedCoresPerPset, itsPersonality, inputPsets, outputPsets);
-#endif
-
-  vector<unsigned>::const_iterator inputPsetIndex  = std::find(inputPsets.begin(),  inputPsets.end(),  myPset);
-  vector<unsigned>::const_iterator outputPsetIndex = std::find(outputPsets.begin(), outputPsets.end(), myPset);
-
-  itsIsTransposeInput  = inputPsetIndex  != inputPsets.end();
-  itsIsTransposeOutput = outputPsetIndex != outputPsets.end();
-
-  unsigned nrStations		   = parset->nrStations();
-  unsigned nrBaselines		   = nrStations * (nrStations + 1) / 2;
-  unsigned nrSamplesPerIntegration = parset->BGLintegrationSteps();
-  unsigned nrSamplesToBGLProc	   = parset->nrSamplesToBGLProc();
-
-  size_t inputDataSize      = itsIsTransposeInput  ? InputData::requiredSize(outputPsets.size(), nrSamplesToBGLProc) : 0;
-  size_t transposedDataSize = itsIsTransposeOutput ? TransposedData::requiredSize(nrStations, nrSamplesToBGLProc) : 0;
-  size_t filteredDataSize   = itsIsTransposeOutput ? FilteredData::requiredSize(nrStations, nrSamplesPerIntegration) : 0;
-  size_t correlatedDataSize = itsIsTransposeOutput ? CorrelatedData::requiredSize(nrBaselines) : 0;
-
-  itsHeaps[0] = new Heap(std::max(inputDataSize, filteredDataSize), 32);
-  itsHeaps[1] = new Heap(std::max(transposedDataSize, correlatedDataSize), 32);
-
-  if (itsIsTransposeInput) {
-    itsInputData = new InputData(*itsHeaps[0], outputPsets.size(), nrSamplesToBGLProc);
-  }
-
-  if (itsIsTransposeOutput) {
-    // FIXME: !useGather not implemented
-    ASSERT(parset->getBool("OLAP.IONProc.useGather"));
-
-    unsigned nrSubbandsPerPset	= parset->nrSubbandsPerPset();
-    unsigned logicalNode	= usedCoresPerPset * (outputPsetIndex - outputPsets.begin()) + myCore;
-    // TODO: logicalNode assumes output psets are consecutively numbered
-
-    itsCenterFrequencies = parset->refFreqs();
-    itsFirstSubband	 = (logicalNode / usedCoresPerPset) * nrSubbandsPerPset;
-    itsLastSubband	 = itsFirstSubband + nrSubbandsPerPset;
-    itsCurrentSubband	 = itsFirstSubband + logicalNode % usedCoresPerPset % nrSubbandsPerPset;
-    itsSubbandIncrement	 = usedCoresPerPset % nrSubbandsPerPset;
-
-#if defined HAVE_MPI
-    printSubbandList();
-#endif
-
-    itsTransposedData = new TransposedData(*itsHeaps[1], nrStations, nrSamplesToBGLProc);
-    itsFilteredData   = new FilteredData(*itsHeaps[0], nrStations, nrSamplesPerIntegration);
-    itsCorrelatedData = new CorrelatedData(*itsHeaps[1], nrBaselines);
-
-    itsPPF	      = new PPF(nrStations, nrSamplesPerIntegration, parset->sampleRate() / NR_SUBBAND_CHANNELS, parset->getBool("OLAP.delayCompensation"));
-    itsCorrelator     = new Correlator(nrStations, nrSamplesPerIntegration);
-  }
-
-#if defined HAVE_MPI
-  if (itsIsTransposeInput || itsIsTransposeOutput) {
-    itsTranspose = new Transpose(itsIsTransposeInput, itsIsTransposeOutput, myCore, nrStations);
-    itsTranspose->setupTransposeParams(inputPsets, outputPsets, itsInputData, itsTransposedData);
-  }
-#endif
-}
-
-#else
-
 void BGL_Processing::preprocess(BGL_Configuration &configuration)
 {
   //checkConsistency(parset);	TODO
 
-#if defined HAVE_BGL
+// #if defined HAVE_BGL
+//   unsigned usedCoresPerPset = configuration.nrUsedCoresPerPset();
+//   unsigned myPset	    = itsPersonality.getPsetNum();
+//   unsigned myCore	    = BGL_Mapping::reverseMapCoreOnPset(itsRankInPset, myPset);
+#if defined HAVE_BGL || defined HAVE_BGP
   unsigned usedCoresPerPset = configuration.nrUsedCoresPerPset();
-  unsigned myPset	    = itsPersonality.getPsetNum();
-  unsigned myCore	    = BGL_Mapping::reverseMapCoreOnPset(itsRankInPset, myPset);
+  unsigned myPset	    = itsLocationInfo.psetNumber();
+  unsigned myCore	    = BGL_Mapping::reverseMapCoreOnPset(itsLocationInfo.rankInPset(), myPset);
 #else
   unsigned usedCoresPerPset = 1;
   unsigned myPset	    = 0;
@@ -324,8 +255,10 @@ void BGL_Processing::preprocess(BGL_Configuration &configuration)
   std::vector<unsigned> &inputPsets  = configuration.inputPsets();
   std::vector<unsigned> &outputPsets = configuration.outputPsets();
 
-#if defined HAVE_BGL
-  Transpose::getMPIgroups(usedCoresPerPset, itsPersonality, inputPsets, outputPsets);
+// #if defined HAVE_BGL
+//   Transpose::getMPIgroups(usedCoresPerPset, itsPersonality, inputPsets, outputPsets);
+#if defined HAVE_BGP || defined HAVE_BGL
+  Transpose::getMPIgroups(usedCoresPerPset, itsLocationInfo, inputPsets, outputPsets);
 #endif
 
   std::vector<unsigned>::const_iterator inputPsetIndex  = std::find(inputPsets.begin(),  inputPsets.end(),  myPset);
@@ -397,13 +330,13 @@ void BGL_Processing::preprocess(BGL_Configuration &configuration)
   }
 #endif
 }
-#endif
 
 
 void BGL_Processing::process()
 {
   NSTimer totalTimer("total", LOG_CONDITION);
   totalTimer.start();
+
   if (itsIsTransposeInput) {
 #if defined HAVE_MPI
     if (LOG_CONDITION)
@@ -446,9 +379,6 @@ MPI_Barrier(itsTransposeGroup);
     itsCorrelator->computeFlagsAndCentroids(itsFilteredData, itsCorrelatedData);
     itsCorrelator->correlate(itsFilteredData, itsCorrelatedData);
 
-    if ((itsCurrentSubband += itsSubbandIncrement) >= itsLastSubband)
-      itsCurrentSubband -= itsLastSubband - itsFirstSubband;
-
     computeTimer.stop();
 
 #if defined HAVE_MPI
@@ -475,6 +405,10 @@ MPI_Barrier(itsTransposeGroup);
     for (double time = MPI_Wtime() + 4.0; MPI_Wtime() < time;)
       ;
 #endif
+
+  if ((itsCurrentSubband += itsSubbandIncrement) >= itsLastSubband)
+    itsCurrentSubband -= itsLastSubband - itsFirstSubband;
+
   totalTimer.stop();
 }
 
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h
index 23152837bf8..fd4cb500602 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h
@@ -21,7 +21,7 @@
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_BGL_PROCESSING_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_BGL_PROCESSING_H
 
-#if 0 || !defined HAVE_BGL
+#if 0 || !(defined HAVE_BGL || defined HAVE_BGP)
 #define C_IMPLEMENTATION
 #endif
 
@@ -34,6 +34,7 @@
 #endif
 
 #include <CS1_Interface/Allocator.h>
+
 #include <InputData.h>
 #include <FilteredData.h>
 #include <TransposedData.h>
@@ -43,9 +44,7 @@
 #include <PPF.h>
 #include <Correlator.h>
 
-#if defined HAVE_BGL
-#include <mpi.h>
-#endif
+#include <LocationInfo.h>
 
 #if defined HAVE_BGL
 #include <bglpersonality.h>
@@ -60,7 +59,7 @@ namespace CS1 {
 
 class BGL_Processing {
   public:
-			BGL_Processing(TransportHolder *th);
+			BGL_Processing(TransportHolder *, const LocationInfo &);
 			~BGL_Processing();
 
 #if 0
@@ -91,6 +90,7 @@ class BGL_Processing {
 #endif
 
     TransportHolder	*itsTransportHolder;
+    const LocationInfo	&itsLocationInfo;
     std::vector<double> itsCenterFrequencies;
     unsigned    	itsFirstSubband, itsCurrentSubband, itsLastSubband, itsSubbandIncrement;
     bool		itsIsTransposeInput, itsIsTransposeOutput;
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/CS1_BGL_Processing_main.cc b/Appl/CEP/CS1/CS1_BGLProc/src/CS1_BGL_Processing_main.cc
index b95c7cee955..510a5d685d7 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/CS1_BGL_Processing_main.cc
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/CS1_BGL_Processing_main.cc
@@ -26,9 +26,12 @@
 #include <Common/Exception.h>
 #include <CS1_Interface/BGL_Command.h>
 #include <CS1_Interface/BGL_Configuration.h>
+#include <Transport/TH_File.h>
 #include <Transport/TH_Null.h>
+#include <Transport/TH_Socket.h>
 #include <CS1_BGLProc/TH_ZoidClient.h>
 #endif
+#include <CS1_BGLProc/LocationInfo.h>
 #include <CS1_BGLProc/BGL_Processing.h>
 #include <CS1_BGLProc/Package__Version.h>
 #include <Transport/TH_MPI.h>
@@ -40,6 +43,8 @@ using namespace LOFAR::CS1;
 
 int main(int argc, char **argv)
 {
+  std::clog.rdbuf(std::cout.rdbuf());
+
   try {
     BGL_Processing::original_argv = argv;
 
@@ -53,18 +58,34 @@ int main(int argc, char **argv)
       Version::show<CS1_BGLProcVersion> (std::cout, "CS1_BGLProc", type);
     }
 
+    LocationInfo   locationInfo;
+
 #if defined HAVE_ZOID && defined HAVE_BGL
-    TH_ZoidClient     th;
+    TH_ZoidClient  th;
+#elif 0
+    TH_Null	   th;
+#elif 0
+    usleep(10000 * locationInfo.rankInPset()); // do not connect all at the same time
+
+    TH_Socket	   th("127.0.0.1", boost::lexical_cast<string>(5000 + locationInfo.rankInPset()));
+
+    while (!th.init())
+      sleep(1);
 #else
-    TH_Null	      th;
+    TH_File	   th(string("/tmp/sock.") + boost::lexical_cast<string>(locationInfo.rankInPset()), TH_File::Read);
+
+    while (!th.init())
+      sleep(1);
 #endif
 
-    BGL_Processing    proc(&th);
-    BGL_Command	      command;
+    BGL_Processing proc(&th, locationInfo);
+    BGL_Command	   command;
 
     do {
+std::clog << TH_MPI::getCurrentRank() << " read command" << std::endl;
       command.read(&th);
 
+std::clog << TH_MPI::getCurrentRank() << " received command " << (unsigned) command.value() << std::endl;
       switch (command.value()) {
 	case BGL_Command::PREPROCESS :	{
 					  BGL_Configuration configuration;
@@ -82,6 +103,7 @@ int main(int argc, char **argv)
 
 	default :			break;
       }
+std::clog << TH_MPI::getCurrentRank() << " command handled" << std::endl;
     } while (command.value() != BGL_Command::STOP);
 
 #if defined HAVE_MPI
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/CacheAlignedAllocator.h b/Appl/CEP/CS1/CS1_BGLProc/src/CacheAlignedAllocator.h
index 71898996683..ebff5e89ef0 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/CacheAlignedAllocator.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/CacheAlignedAllocator.h
@@ -5,7 +5,7 @@
 #include <memory>
 
 
-#if defined HAVE_BGL
+#if defined HAVE_BGL || defined HAVE_BGP
 #define CACHE_LINE_SIZE	32
 #define CACHE_ALIGNED	__attribute__ ((aligned(CACHE_LINE_SIZE)))
 #else
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc b/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc
index 7e623cf2706..2656c6fffa7 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc
@@ -190,6 +190,7 @@ void Correlator::correlate(const FilteredData *filteredData, CorrelatedData *cor
     for (unsigned stat2 = nrValidStations % 2 ? 1 : 2; stat2 < nrValidStations; stat2 += 2) {
       unsigned stat1 = 0;
 
+#if defined HAVE_BGL
       // do as many 3x2 blocks as possible
       for (; stat1 + 3 <= stat2; stat1 += 3) { 
 	unsigned stat10 = map[stat1], stat11 = map[stat1+1], stat12 = map[stat1+2];
@@ -208,9 +209,10 @@ void Correlator::correlate(const FilteredData *filteredData, CorrelatedData *cor
 		       correlatedData->visibilities[baseline(stat12, stat21)][ch].origin(),
 		       itsNrSamplesPerIntegration);
       }
+#endif
 
       // see if a 2x2 block is necessary
-      if (stat1 + 2 <= stat2) {
+      for (; stat1 + 2 <= stat2; stat1 += 2) {
 	unsigned stat10 = map[stat1], stat11 = map[stat1+1];
 	unsigned stat20 = map[stat2], stat21 = map[stat2+1];
 
@@ -223,7 +225,6 @@ void Correlator::correlate(const FilteredData *filteredData, CorrelatedData *cor
 		       correlatedData->visibilities[baseline(stat11, stat20)][ch].origin(),
 		       correlatedData->visibilities[baseline(stat11, stat21)][ch].origin(),
 		       itsNrSamplesPerIntegration);
-	stat1 += 2;
       }
 
       // do the remaining (auto)correlations near the diagonal
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h b/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h
index 730b1d9de70..0ebae44d791 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h
@@ -1,7 +1,7 @@
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_CORRELATOR_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_CORRELATOR_H
 
-#if 0 || !defined HAVE_BGL
+#if 0 || !(defined HAVE_BGL || defined HAVE_BGP)
 #define CORRELATOR_C_IMPLEMENTATION
 #endif
 
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S b/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S
index 341ac3671d1..b8608d24550 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S
@@ -18,7 +18,7 @@
 #
 #  $Id$
 
-#if defined HAVE_BGL
+#if defined HAVE_BGL || defined HAVE_BGP
 #define BGL_PROCESSING
 #include <CS1_Interface/CS1_Config.h>
 
@@ -218,25 +218,33 @@ loop1:	# loop over time
 
 	bdnz	 loop1
 
+#if !defined HAVE_BGP
 	dcbz	 0,7
+#endif
 	stfpsx	 0,0,7		# store results S0 * ~S2
 	stfpsux	 1,7,11
 	stfpsux	 2,7,11
 	stfpsux	 3,7,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,8
+#endif
 	stfpsx	 4,0,8		# store results S0 * ~S3
 	stfpsux	 5,8,11
 	stfpsux	 6,8,11
 	stfpsux	 7,8,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,9
+#endif
 	stfpsx	 8,0,9		# store results S1 * ~S2
 	stfpsux	 9,9,11
 	stfpsux	 10,9,11
 	stfpsux	 11,9,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,10
+#endif
 	stfpsx	 12,0,10	# store results S1 * ~S3
 	stfpsux	 13,10,11
 	stfpsux	 14,10,11
@@ -389,18 +397,18 @@ loop2:
 	fxcxma   20,24,30,20
 	fxcxma   21,24,31,21
 	fxcxma   22,25,30,22	; lfpsux	 30,7,11
-	fxcxma   23,25,31,23	; lfpsux	 31,7,11
+	fxcxma   23,25,31,23
 
 	# S0 * ~S3, phase 1	; # ld S1
-	fxcpnsma 0,26,28,0	; lfpsux	 24,4,11
-	fxcpnsma 1,26,29,1	; lfpsux	 25,4,11
-	fxcpnsma 2,27,28,2
-	fxcpnsma 3,27,29,3
+	fxcpnsma 0,26,28,0
+	fxcpnsma 1,26,29,1
+	fxcpnsma 2,27,28,2	; lfpsux	 31,7,11
+	fxcpnsma 3,27,29,3	; lfpsux	 24,4,11
 
 	# S0 * ~S4, phase 1
 	fxcpnsma 4,26,30,4
 	fxcpnsma 5,26,31,5
-	fxcpnsma 6,27,30,6
+	fxcpnsma 6,27,30,6	; lfpsux	 25,4,11
 	fxcpnsma 7,27,31,7
 
 	# S0 * ~S4, phase 2
@@ -469,37 +477,49 @@ loop2:
 	lwz	 4,288+12(1)
 	lwz	 5,288+16(1)
 
+#if !defined HAVE_BGP
 	dcbz	 0,8
+#endif
 	stfpsx	 0,0,8		# store results S0 * ~S3
 	stfpsux	 1,8,11
 	stfpsux	 2,8,11
 	stfpsux	 3,8,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,9
+#endif
 	stfpsx	 4,0,9		# store results S0 * ~S4
 	stfpsux	 5,9,11
 	stfpsux	 6,9,11
 	stfpsux	 7,9,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,10
+#endif
 	stfpsx	 8,0,10		# store results S1 * ~S3
 	stfpsux	 9,10,11
 	stfpsux	 10,10,11
 	stfpsux	 11,10,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,3
+#endif
 	stfpsx	 12,0,3		# store results S1 * ~S4
 	stfpsux	 13,3,11
 	stfpsux	 14,3,11
 	stfpsux	 15,3,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,4
+#endif
 	stfpsx	 16,0,4		# store results S2 * ~S3
 	stfpsux	 17,4,11
 	stfpsux	 18,4,11
 	stfpsux	 19,4,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,5
+#endif
 	stfpsx	 20,0,5		# store results S2 * ~S4
 	stfpsux	 21,5,11
 	stfpsux	 22,5,11
@@ -593,10 +613,12 @@ loop4:	# loop over time
 	bdnz	 loop4
 
 	fxcsnsma 0,0,12,0	# f0r += 1*f0i, f0i += -1*f0i = 0
-	fxcsnsma 3,3,12,3
 	fpadd	 1,1,2
+	fxcsnsma 3,3,12,3
 
+#if !defined HAVE_BGP
 	dcbz	 0,4		# store result
+#endif
 	stfpsx	 0,0,4
 	stfpsux	 1,4,8
 	fsneg	 1,1
@@ -733,20 +755,26 @@ loop3:	# loop over time
 	fxcsnsma 8,8,12,8
 	fxcsnsma 11,11,12,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,5
+#endif
 	stfpsx	 0,0,5		# store results S0 * ~S0
 	stfpsux	 1,5,11
 	fsneg	 1,1
 	stfpsux	 1,5,11
 	stfpsux	 3,5,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,6
+#endif
 	stfpsx	 4,0,6		# store results S0 * ~S1
 	stfpsux	 5,6,11
 	stfpsux	 6,6,11
 	stfpsux	 7,6,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,7
+#endif
 	stfpsx	 8,0,7		# store results S1 * ~S1
 	stfpsux	 9,7,11
 	fsneg	 9,9
@@ -972,32 +1000,42 @@ loop7:	# loop over time
 	fxcsnsma 15,15,18,15
 	fxcsnsma 17,17,18,17
 
+#if !defined HAVE_BGP
 	dcbz	 0,6
+#endif
 	stfpsx	 0,0,6		# store results S0 * ~S1
 	stfpsux	 1,6,11
 	stfpsux	 2,6,11
 	stfpsux	 3,6,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,7
+#endif
 	stfpsx	 4,0,7		# store results S0 * ~S2
 	stfpsux	 5,7,11
 	stfpsux	 6,7,11
 	stfpsux	 7,7,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,8
+#endif
 	stfpsx	 8,0,8		# store results S1 * ~S1
 	stfpsux	 9,8,11
 	fsneg	 9,9
 	stfpsux	 9,8,11
 	stfpsux	 10,8,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,9
+#endif
 	stfpsx	 11,0,9		# store results S1 * ~S2
 	stfpsux	 12,9,11
 	stfpsux	 13,9,11
 	stfpsux	 14,9,11
 
+#if !defined HAVE_BGP
 	dcbz	 0,10
+#endif
 	stfpsx	 15,0,10	# store results S2 * ~S2
 	stfpsux	 16,10,11
 	fsneg	 16,16
@@ -1139,7 +1177,19 @@ _add_correlations:
 .align	5
 .global	_clear_correlation
 _clear_correlation:
+#if defined HAVE_BGP
+	li	0,0
+	stw	0,0(3)
+	stw	0,4(3)
+	stw	0,8(3)
+	stw	0,12(3)
+	stw	0,16(3)
+	stw	0,20(3)
+	stw	0,24(3)
+	stw	0,28(3)
+#else
 	dcbz	0,3		# clear the entire cache line, it contains all
+#endif
 	blr			# polarizations
 
 
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h b/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h
index 6293a7d2321..0b01c93afde 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h
@@ -21,7 +21,7 @@
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_CORRELATOR_ASM_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_CORRELATOR_ASM_H
 
-#if defined HAVE_BGL
+#if defined HAVE_BGL || defined HAVE_BGP
 #include <CS1_Interface/CS1_Config.h>
 
 namespace LOFAR {
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S b/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S
index 2ef3f3b5dcc..734479ccc95 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S
@@ -1,4 +1,4 @@
-#if defined HAVE_BGL
+#if defined HAVE_BGL || defined HAVE_BGP
 
 _C1:	.long	0x3F800000, 0x3F800000
 _W:	.long	0x3F6C835E, 0xBEC3EF15
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h b/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h
index 1b5d4f8fd33..1f45873b97a 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h
@@ -21,7 +21,7 @@
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FFT_ASM_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FFT_ASM_H
 
-#if defined HAVE_BGL
+#if defined HAVE_BGL || defined HAVE_BGP
 #include <Common/lofar_complex.h>
 
 namespace LOFAR {
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FIR.h b/Appl/CEP/CS1/CS1_BGLProc/src/FIR.h
index 151d99437cb..8ae73d35218 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/FIR.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/FIR.h
@@ -1,7 +1,7 @@
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FIR_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FIR_H
 
-#if 0 || !defined HAVE_BGL
+#if 0 || !(defined HAVE_BGL || defined HAVE_BGP)
 #define FIR_C_IMPLEMENTATION
 #endif
 
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S
index bb2b949c4d4..20283314686 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S
@@ -17,7 +17,7 @@
 #
 #  $Id$
 
-#if defined HAVE_BGL
+#if defined HAVE_BGL || defined HAVE_BGP
 #define BGL_PROCESSING
 #include <CS1_Interface/CS1_Config.h>
 
@@ -93,7 +93,9 @@ L4:
 	lfpsux	2,4,6
 	lfpsux	3,4,6
 
+#if !defined HAVE_BGP
 	dcbz	3,7
+#endif
 	stfpsux	0,3,7
 	stfpsux	1,3,9
 	stfpsux	2,3,9
@@ -164,7 +166,11 @@ L5:
 	fxcxnpma 3,3,10,15	; lfpsux	7,4,7
 
 	fxcxnpma 8,8,9,16
+#if defined HAVE_BGP
+	fxcxnpma 10,10,11,17
+#else
 	fxcxnpma 10,10,11,17	; dcbz		3,6
+#endif
  
 	fxpmul	12,4,8		; stfpsux	0,3,6
 	fxpmul	13,5,8		; stfpsux	1,3,10
@@ -180,7 +186,11 @@ L5:
 	fxcxnpma 7,7,10,15	; lfpsux	3,4,7
 
 	fxcxnpma 8,8,9,16
+#if defined HAVE_BGP
+	fxcxnpma 10,10,11,17
+#else
 	fxcxnpma 10,10,11,17	; dcbz		3,6
+#endif
  
 	fxpmul	12,0,8		; stfpsux	4,3,6
 	fxpmul	13,1,8 		; stfpsux	5,3,10
@@ -1604,9 +1614,29 @@ L8:	cmpi	0,7,NR_SAMPLES_PER_INTEGRATION
 #endif
 
 
+#if defined HAVE_BGP
+zero:	.long	0,0
+#endif
+
 .align 5
 .global _memzero
 _memzero:
+#if defined HAVE_BGP
+	lis	5,zero@ha	; srwi	4,4,7
+	addi	5,5,zero@l	; mtctr	4
+	lfpsx	0,0,5		; li	8,16
+	subi	3,3,16
+
+L1:	stfpdux	0,3,8
+	stfpdux	0,3,8
+	stfpdux	0,3,8
+	stfpdux	0,3,8
+	stfpdux	0,3,8
+	stfpdux	0,3,8
+	stfpdux	0,3,8
+	stfpdux	0,3,8
+	bdnz	L1
+#else
 	srwi	4,4,7
 	mtctr	4
 	li	4,32
@@ -1614,13 +1644,13 @@ _memzero:
 	li	6,96
 	li	7,128
 
-L1:	
-	dcbz	0,3
+L1:	dcbz	0,3
 	dcbz	3,4
 	dcbz	3,5
 	dcbz	3,6
 	add	3,3,7
 	bdnz	L1
+#endif
 	blr
 
 
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h
index 709cc3d6c99..c40ce8e59ff 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h
@@ -21,10 +21,12 @@
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FIR_ASM_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FIR_ASM_H
 
-#if defined HAVE_BGL
+#if defined HAVE_BGL || defined HAVE_BGP
 #include <CS1_Interface/CS1_Config.h>
 
+#if defined HAVE_BGL
 #include <rts.h>
+#endif
 
 namespace LOFAR
 {
@@ -65,7 +67,10 @@ extern "C" {
     unsigned nr_polarizations;
   } _FIR_constants_used;
 
+#if defined HAVE_BGL
   void _bgl_mutex_lock(BGL_Mutex *), _bgl_mutex_unlock(BGL_Mutex *);
+#endif
+
   unsigned long long _rdtsc();
 };
 
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc b/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc
new file mode 100644
index 00000000000..60727cf0127
--- /dev/null
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc
@@ -0,0 +1,134 @@
+#include <lofar_config.h>
+
+#include <LocationInfo.h>
+
+#include <CS1_Interface/BGL_Mapping.h>
+#include <CS1_Interface/PrintVector.h>
+#include <Transport/TH_MPI.h>
+
+#if defined HAVE_BGP
+#include <common/bgp_personality_inlines.h>
+#include <spi/kernel_interface.h>
+#endif
+
+
+#include <iostream>
+
+
+namespace LOFAR {
+namespace CS1 {
+
+LocationInfo::LocationInfo()
+: itsPsetNumber(0), itsRankInPset(0)	// defined defaults; getPersonality() overwrites them when it runs
+{
+#if defined HAVE_BGP || defined HAVE_BGL
+  getPersonality();
+#endif
+}
+
+// Blue Gene/P: read the kernel personality, then derive this core's pset number and its rank within that pset.
+#if defined HAVE_BGP
+void LocationInfo::getPersonality()
+{
+  if (Kernel_GetPersonality(&itsPersonality, sizeof itsPersonality) != 0) {
+    std::cerr << "could not get personality" << std::endl;
+    exit(1);
+  }
+
+  if (TH_MPI::getCurrentRank() == 0)	// only rank 0 logs the partition shape
+    std::clog << "topology = ("
+	      << BGP_Personality_xSize(&itsPersonality) << ','
+	      << BGP_Personality_ySize(&itsPersonality) << ','
+	      << BGP_Personality_zSize(&itsPersonality) << "), torus wraparound = ("
+	      << (BGP_Personality_isTorusX(&itsPersonality) ? 'T' : 'F') << ','
+	      << (BGP_Personality_isTorusY(&itsPersonality) ? 'T' : 'F') << ','
+	      << (BGP_Personality_isTorusZ(&itsPersonality) ? 'T' : 'F') << ')'
+	      << std::endl;
+
+  itsPsetNumbers.resize(TH_MPI::getNumberOfNodes());
+  itsPsetNumber = BGP_Personality_psetNum(&itsPersonality);
+  itsPsetNumbers[TH_MPI::getCurrentRank()] = itsPsetNumber;	// our own entry; the others arrive below
+
+  for (int core = 0; core < TH_MPI::getNumberOfNodes(); core ++)	// rank 'core' is the root of its own broadcast
+    MPI_Bcast(&itsPsetNumbers[core], 1, MPI_UNSIGNED, core, MPI_COMM_WORLD);	// element type is unsigned
+
+  itsRankInPset = 0;	// counts the lower-numbered MPI ranks that share our pset
+
+  for (int rank = 0; rank < TH_MPI::getCurrentRank(); rank ++)
+    if (itsPsetNumbers[rank] == itsPsetNumber)
+      ++ itsRankInPset;
+
+  //usleep(100000 * TH_MPI::getCurrentRank());
+
+  if (TH_MPI::getCurrentRank() == 0) {	// rank 0 logs which MPI ranks ended up in which pset
+    std::vector<std::vector<unsigned> > cores(BGP_Personality_numIONodes(&itsPersonality));
+
+    for (unsigned rank = 0; rank < itsPsetNumbers.size(); rank ++)
+      cores[itsPsetNumbers[rank]].push_back(rank);
+
+    for (unsigned pset = 0; pset < BGP_Personality_numIONodes(&itsPersonality); pset ++)
+      std::clog << "pset " << pset << " contains cores " << cores[pset] << std::endl;
+  }
+}
+
+#endif
+// Blue Gene/L: read the node personality, then derive this core's pset number and its rank within that pset.
+#if defined HAVE_BGL
+void LocationInfo::getPersonality()
+{
+  if (rts_get_personality(&itsPersonality, sizeof(itsPersonality)) != 0) {
+    std::cerr << "could not get personality" << std::endl;
+    exit(1);
+  }
+
+  if (TH_MPI::getCurrentRank() == 0)	// call the function; comparing its address with 0 is always false
+    std::clog << "topology = ("
+	      << itsPersonality.getXsize() << ','
+	      << itsPersonality.getYsize() << ','
+	      << itsPersonality.getZsize() << "), torus wraparound = ("
+	      << (itsPersonality.isTorusX() ? 'T' : 'F') << ','
+	      << (itsPersonality.isTorusY() ? 'T' : 'F') << ','
+	      << (itsPersonality.isTorusZ() ? 'T' : 'F') << ')'
+	      << std::endl;
+
+  itsPsetNumbers.resize(TH_MPI::getNumberOfNodes());
+  itsPsetNumber = itsPersonality.getPsetNum();
+  itsPsetNumbers[TH_MPI::getCurrentRank()] = itsPsetNumber;	// our own entry; the others arrive below
+
+  for (int core = 0; core < TH_MPI::getNumberOfNodes(); core ++)	// rank 'core' is the root of its own broadcast
+    MPI_Bcast(&itsPsetNumbers[core], 1, MPI_UNSIGNED, core, MPI_COMM_WORLD);	// element type is unsigned
+
+  itsRankInPset = 0;	// counts the lower-numbered MPI ranks that share our pset
+
+  for (int rank = 0; rank < TH_MPI::getCurrentRank(); rank ++)
+    if (itsPsetNumbers[rank] == itsPsetNumber)
+      ++ itsRankInPset;
+
+  //usleep(100000 * TH_MPI::getCurrentRank());
+
+  if (TH_MPI::getCurrentRank() == 0) {	// rank 0 logs which MPI ranks ended up in which pset
+    std::vector<std::vector<unsigned> > cores(itsPersonality.numPsets());	// same bound as the print loop below
+
+    for (unsigned rank = 0; rank < itsPsetNumbers.size(); rank ++)
+      cores[itsPsetNumbers[rank]].push_back(rank);
+
+    for (unsigned pset = 0; pset < itsPersonality.numPsets(); pset ++)
+      std::clog << "LocationInfo :: pset " << pset << " contains cores " << cores[pset] << std::endl;
+  }
+}
+
+#endif
+
+// Returns the MPI rank of the n-th core (in rank order) of pset, where n = BGL_Mapping::mapCoreOnPset(core, pset).
+unsigned LocationInfo::remapOnTree(unsigned pset, unsigned core) const
+{
+  core = BGL_Mapping::mapCoreOnPset(core, pset);
+  for (unsigned rank = 0; rank < itsPsetNumbers.size(); rank ++)	// bounded: never index past the rank table
+    if (itsPsetNumbers[rank] == pset && core -- == 0)	// count down only over ranks that belong to pset
+      return rank;
+  std::cerr << "could not remap core on pset " << pset << std::endl;	// pset has fewer cores than requested
+  exit(1);
+}
+
+} // namespace CS1
+} // namespace LOFAR
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h b/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h
new file mode 100644
index 00000000000..9d60cd265e4
--- /dev/null
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h
@@ -0,0 +1,67 @@
+#ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_LOCATION_INFO_H
+#define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_LOCATION_INFO_H
+
+#include <vector>
+
+#if defined HAVE_BGP
+// we do not need mpi.h here, but including it after bgp_personality.h leads
+// to compilation errors
+#define MPICH_IGNORE_CXX_SEEK
+#include <mpi.h>
+
+#include <common/bgp_personality.h>
+#endif
+
+#if defined HAVE_BGL
+#include <bglpersonality.h>
+#include <rts.h>
+#endif
+
+
+namespace LOFAR {
+namespace CS1 {
+
+// Records where the calling MPI rank lives: its pset, its index within that pset, and the pset of every rank.
+class LocationInfo
+{
+  public:
+	     LocationInfo();
+
+    unsigned remapOnTree(unsigned pset, unsigned core) const;
+
+    unsigned psetNumber() const;
+    unsigned rankInPset() const;
+
+  private:
+#if defined HAVE_BGP || defined HAVE_BGL
+    void getPersonality();
+#endif
+
+#if defined HAVE_BGP
+    _BGP_Personality_t    itsPersonality;
+#elif defined HAVE_BGL
+    BGLPersonality        itsPersonality;
+#endif
+
+    // Declared on every platform because the accessors and remapOnTree() use them unconditionally.
+    // NOTE(review): confirm the constructor gives them defined values on non-Blue Gene builds.
+    std::vector<unsigned> itsPsetNumbers;
+    unsigned		  itsPsetNumber, itsRankInPset;
+};
+
+
+inline unsigned LocationInfo::psetNumber() const
+{
+  return this->itsPsetNumber;  // pset number stored for the calling rank
+}
+
+
+inline unsigned LocationInfo::rankInPset() const
+{
+  return this->itsRankInPset;  // count of lower MPI ranks that share this rank's pset
+}
+
+} // namespace CS1
+} // namespace LOFAR
+
+#endif
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am b/Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am
index 2bd9b6f094b..108ab8d887a 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am
@@ -1,6 +1,6 @@
 INSTHDRS         	=\
 Package__Version.h      \
-Allocator.h		\
+LocationInfo.h		\
 CorrelatorAsm.h		\
 FFT_Asm.h		\
 FIR_Asm.h		\
@@ -34,7 +34,7 @@ CCASFLAGS		= $(patsubst -q%,,$(CPPFLAGS)) $(EXTRA_CPPFLAGS)
 
 CS1_BGL_Processing_SOURCES	= $(DOCHDRS) \
 Package__Version.cc             \
-Allocator.cc			\
+LocationInfo.cc			\
 CorrelatorAsm.S			\
 FIR_Asm.S			\
 FFT_Asm.S			\
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc
index 40f2e825dcb..de4ea11b9f7 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc
@@ -197,7 +197,7 @@ void PPF::filter(double centerFrequency, const TransposedData *transposedData, F
 #endif
 
 #if defined PPF_C_IMPLEMENTATION
-    fcomplex fftOutData[NR_SUBBAND_CHANNELS];
+    fcomplex fftOutData[NR_SUBBAND_CHANNELS] __attribute__ ((aligned(sizeof(fcomplex))));
 
     FIRtimer.start();
     for (unsigned pol = 0; pol < NR_POLARIZATIONS; pol ++) {
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.h b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.h
index 81831cafebc..c536655bb59 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.h
@@ -1,7 +1,7 @@
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_PPF_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_PPF_H
 
-#if 0 || !defined HAVE_BGL
+#if 0 || !(defined HAVE_BGL || defined HAVE_BGP)
 #define PPF_C_IMPLEMENTATION
 #endif
 
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc b/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc
index 42745052ff3..ca075177b38 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc
@@ -10,6 +10,9 @@
 
 #if defined HAVE_BGL
 #include <rts.h>
+//#elif defined HAVE_BGP
+//#include <common/bgp_personality_inlines.h>
+//#include <spi/kernel_interface.h>
 #endif
 
 #include <cassert>
@@ -44,48 +47,68 @@ Transpose::~Transpose()
 }
 
 
-#if defined HAVE_BGL
+// #if defined HAVE_BGL
 
-unsigned Transpose::remapOnTree(unsigned pset, unsigned core, const struct BGLPersonality &personality)
-{
-  unsigned psetXsize  = personality.getXpsetSize();
-  unsigned psetYsize  = personality.getYpsetSize();
-  unsigned psetZsize  = personality.getZpsetSize();
+// unsigned Transpose::remapOnTree(unsigned pset, unsigned core, const struct BGLPersonality &personality)
+// {
+//   unsigned psetXsize  = personality.getXpsetSize();
+//   unsigned psetYsize  = personality.getYpsetSize();
+//   unsigned psetZsize  = personality.getZpsetSize();
 
-  unsigned psetXcount = personality.getXsize() / psetXsize;
-  unsigned psetYcount = personality.getYsize() / psetYsize;
-  unsigned psetZcount = personality.getZsize() / psetZsize;
+//   unsigned psetXcount = personality.getXsize() / psetXsize;
+//   unsigned psetYcount = personality.getYsize() / psetYsize;
+//   unsigned psetZcount = personality.getZsize() / psetZsize;
 
-  unsigned xOrigin    = pset			       % psetXcount * psetXsize;
-  unsigned yOrigin    = pset / psetXcount	       % psetYcount * psetYsize;
-  unsigned zOrigin    = pset / psetXcount / psetYcount % psetZcount * psetZsize;
+//   unsigned xOrigin    = pset			       % psetXcount * psetXsize;
+//   unsigned yOrigin    = pset / psetXcount	       % psetYcount * psetYsize;
+//   unsigned zOrigin    = pset / psetXcount / psetYcount % psetZcount * psetZsize;
 
-  unsigned nodesPerPset = personality.numNodesInPset();
+//   unsigned nodesPerPset = personality.numNodesInPset();
 
-  unsigned numProcs, xOffset, yOffset, zOffset, node;
+//   unsigned numProcs, xOffset, yOffset, zOffset, node;
 
-  core = BGL_Mapping::mapCoreOnPset(core, pset);
-  personality.coordsForPsetRank(core % nodesPerPset, xOffset, yOffset, zOffset);
+//   core = BGL_Mapping::mapCoreOnPset(core, pset);
+//   personality.coordsForPsetRank(core % nodesPerPset, xOffset, yOffset, zOffset);
 
-  unsigned x = xOrigin + xOffset - personality.xPsetOrigin();
-  unsigned y = yOrigin + yOffset - personality.yPsetOrigin();
-  unsigned z = zOrigin + zOffset - personality.zPsetOrigin();
-  unsigned t = core / nodesPerPset;
+//   unsigned x = xOrigin + xOffset - personality.xPsetOrigin();
+//   unsigned y = yOrigin + yOffset - personality.yPsetOrigin();
+//   unsigned z = zOrigin + zOffset - personality.zPsetOrigin();
+//   unsigned t = core / nodesPerPset;
 
-  rts_rankForCoordinates(x, y, z, t, &node, &numProcs);
+//   rts_rankForCoordinates(x, y, z, t, &node, &numProcs);
 
-#if defined HAVE_MPI
-  if (node >= (unsigned) TH_MPI::getNumberOfNodes()) {
-    std::cerr << "not enough nodes allocated (node = " << node << ", TH_MPI::getNumberOfNodes() = " << TH_MPI::getNumberOfNodes() << std::endl;
-    exit(1);
-  }
-#endif
+// #if defined HAVE_MPI
+//   if (node >= (unsigned) TH_MPI::getNumberOfNodes()) {
+//     std::cerr << "not enough nodes allocated (node = " << node << ", TH_MPI::getNumberOfNodes() = " << TH_MPI::getNumberOfNodes() << std::endl;
+//     exit(1);
+//   }
+// #endif
+
+//   return node;
+// }
 
-  return node;
+// #elif defined HAVE_BGP
+#if defined HAVE_BGL || defined HAVE_BGP
+// Returns the rank of the core-th entry (after BGL_Mapping::mapCoreOnPset) of psetNumbers that equals pset.
+unsigned Transpose::remapOnTree(unsigned pset, unsigned core, const std::vector<unsigned> &psetNumbers)
+{
+  core = BGL_Mapping::mapCoreOnPset(core, pset);
+  for (unsigned rank = 0; rank < psetNumbers.size(); rank ++)  // bounded: never index past the table
+    if (psetNumbers[rank] == pset && core -- == 0)
+      return rank;
+  std::cerr << "Transpose :: not enough cores allocated in pset " << pset << std::endl;
+  exit(1);  // matches the MPI_Group_incl() failure handling in this file
 }
 
+#endif
 
-void Transpose::getMPIgroups(unsigned nrCoresPerPset, const struct BGLPersonality &personality, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets)
+// #if defined HAVE_BGL
+// void Transpose::getMPIgroups(unsigned nrCoresPerPset, const struct BGLPersonality &personality, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets)
+// #elif defined HAVE_BGP
+#if defined HAVE_BGL || defined HAVE_BGP
+void Transpose::getMPIgroups(unsigned nrCoresPerPset, const LocationInfo &locationInfo, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets)
+#endif
+#if defined HAVE_BGL || defined HAVE_BGP
 {
   allTransposeGroups.resize(nrCoresPerPset);
 
@@ -105,10 +128,14 @@ void Transpose::getMPIgroups(unsigned nrCoresPerPset, const struct BGLPersonalit
     std::vector<int> ranks;
 
     for (std::set<unsigned>::const_iterator pset = psets.begin(); pset != psets.end(); pset ++)
+#if 0 // defined HAVE_BGL
       ranks.push_back(remapOnTree(*pset, core, personality));
+#else
+      ranks.push_back(locationInfo.remapOnTree(*pset, core));
+#endif
 
     if (TH_MPI::getCurrentRank() == 0)
-      std::clog << "group " << core << " contains cores " << ranks << std::endl;
+      std::clog << "Transpose :: group " << core << " contains cores " << ranks << std::endl;
 
     if (MPI_Group_incl(all, ranks.size(), &ranks[0], &group) != MPI_SUCCESS) {
       std::cerr << "MPI_Group_incl() failed" << std::endl;
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h b/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h
index 222ca7f11c2..6d12bab68d9 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h
@@ -1,19 +1,22 @@
+
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_TRANSPOSE_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_TRANSPOSE_H
 
 #include <InputData.h>
+#include <LocationInfo.h>
 #include <TransposedData.h>
 
 #include <boost/multi_array.hpp>
 
-#if defined HAVE_BGL
-#include <bglpersonality.h>
-#endif
-
 #if defined HAVE_MPI
+#define MPICH_IGNORE_CXX_SEEK
 #include <mpi.h>
 #endif
 
+#if defined HAVE_BGL
+#include <bglpersonality.h>
+#endif
+
 #include <vector>
 
 
@@ -28,10 +31,14 @@ class Transpose {
     ~Transpose();
 
     void setupTransposeParams(const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets, InputData *, TransposedData *);
-    static void	getMPIgroups(unsigned nrCoresPerPset, const BGLPersonality &, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets);
 
-#if defined HAVE_BGL
-    static unsigned remapOnTree(unsigned pset, unsigned core, const struct BGLPersonality &);
+    // Blue Gene builds only.  getMPIgroups() sets up the transpose groups for
+    // nrCoresPerPset cores from the given pset lists, asking LocationInfo for
+    // the MPI rank of each (pset, core) pair.  remapOnTree() does that lookup
+    // on an explicit table holding the pset number of every MPI rank.
+#if defined HAVE_BGL || defined HAVE_BGP
+    static void	getMPIgroups(unsigned nrCoresPerPset, const LocationInfo &, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets);
+    static unsigned remapOnTree(unsigned pset, unsigned core, const std::vector<unsigned> &psetNumbers);
 #endif
 
     void transpose(const InputData *, TransposedData *);
-- 
GitLab