From 562e9e00465ed2fd0dd9836e7c45b466683dbfe0 Mon Sep 17 00:00:00 2001 From: Martin Gels <gels@astron.nl> Date: Tue, 24 Jun 2008 07:41:42 +0000 Subject: [PATCH] Bug 1005: made CS1_IONProc compatible for BGP --- Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc | 60 -------- Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h | 39 ----- .../CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc | 142 +++++------------- Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h | 10 +- .../src/CS1_BGL_Processing_main.cc | 30 +++- .../CS1_BGLProc/src/CacheAlignedAllocator.h | 2 +- Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc | 5 +- Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h | 2 +- Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S | 66 +++++++- Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h | 2 +- Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S | 2 +- Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h | 2 +- Appl/CEP/CS1/CS1_BGLProc/src/FIR.h | 2 +- Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S | 36 ++++- Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h | 7 +- Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc | 134 +++++++++++++++++ Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h | 67 +++++++++ Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am | 4 +- Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc | 2 +- Appl/CEP/CS1/CS1_BGLProc/src/PPF.h | 2 +- Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc | 87 +++++++---- Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h | 21 ++- 22 files changed, 451 insertions(+), 273 deletions(-) delete mode 100644 Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc delete mode 100644 Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h create mode 100644 Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc create mode 100644 Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc b/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc deleted file mode 100644 index 529096b6194..00000000000 --- a/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.cc +++ /dev/null @@ -1,60 +0,0 @@ -#include <lofar_config.h> - -#include <Allocator.h> - -#include <malloc.h> - - -namespace LOFAR { -namespace CS1 
{ - -Heap::Heap(size_t heapSize, int alignment) -{ - size = heapSize; - - if (posix_memalign(&start, alignment, heapSize) != 0) { - std::cerr << "could not allocate heap" << std::endl; - exit(1); - } -} - - -Heap::~Heap() -{ - free(start); -} - - -Overlay::Overlay(const Heap &heap) -{ - - freeList.include(heap.start, (void *) ((char *) heap.start + heap.size)); -} - - -void *Overlay::allocate(size_t size, int alignment) -{ - for (SparseSet<void *>::const_iterator it = freeList.getRanges().begin(); it != freeList.getRanges().end(); it ++) { - void *begin = (void *) (((size_t) it->begin + alignment - 1) & ~(alignment - 1)); - - if ((char *) it->end - (char *) begin >= (ptrdiff_t) size) { - freeList.exclude(begin, (void *) ((char *) begin + size)); - sizes[begin] = size; - return begin; - } - } - - std::cerr << "could not allocate memory from heap" << std::endl; - std::exit(1); -} - - -void Overlay::deallocate(void *ptr) -{ - std::map<void *, size_t>::iterator index = sizes.find(ptr); - freeList.include(ptr, (void *) ((char *) ptr + index->second)); - sizes.erase(index); -} - -} // namespace CS1 -} // namespace LOFAR diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h b/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h deleted file mode 100644 index 720712ed22c..00000000000 --- a/Appl/CEP/CS1/CS1_BGLProc/src/Allocator.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_ALLOCATOR_H -#define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_ALLOCATOR_H - -#include <CS1_Interface/SparseSet.h> -#include <map> - -namespace LOFAR { -namespace CS1 { - -class Heap -{ - public: - Heap(size_t heapSize, int alignment); - ~Heap(); - - private: - friend class Overlay; - void *start; - size_t size; -}; - - -class Overlay -{ - public: - Overlay(const Heap &); - - void *allocate(size_t size, int alignment); - void deallocate(void *ptr); - - private: - SparseSet<void *> freeList; - std::map<void *, size_t> sizes; -}; - -} // namespace CS1 -} // namespace LOFAR - -#endif diff --git 
a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc index b48d087ae54..04c23602d59 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc +++ b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.cc @@ -30,7 +30,6 @@ #include <Transport/TH_MPI.h> #include <CS1_Interface/BGL_Configuration.h> #include <CS1_Interface/BGL_Mapping.h> -#include <CS1_Interface/PrintVector.h> #include <cassert> #include <complex> @@ -39,8 +38,12 @@ #include <iostream> #include <map> +#if defined HAVE_BGP +#include <common/bgp_personality_inlines.h> +#include <spi/kernel_interface.h> +#endif -#if defined HAVE_ZOID && defined HAVE_BGL +#if defined HAVE_ZOID && (defined HAVE_BGL || defined HAVE_BGP) extern "C" { #include <lofar.h> } @@ -48,7 +51,11 @@ extern "C" { #endif #if defined HAVE_MPI +#if defined HAVE_BGP || defined HAVE_BGL +#define LOG_CONDITION (itsCurrentSubband == itsFirstSubband) +#else #define LOG_CONDITION (itsRankInPset == 0) +#endif //#define LOG_CONDITION (TH_MPI::getCurrentRank() == 0) #else #define LOG_CONDITION 1 @@ -73,14 +80,15 @@ static NSTimer computeTimer("computing", true); char **BGL_Processing::original_argv; -BGL_Processing::BGL_Processing(TransportHolder *th) +BGL_Processing::BGL_Processing(TransportHolder *th, const LocationInfo &locationInfo) : itsTransportHolder(th), + itsLocationInfo(locationInfo), itsInputData(0), itsTransposedData(0), itsFilteredData(0), itsCorrelatedData(0), -#if defined HAVE_BGL +#if defined HAVE_BGL || defined HAVE_BGP itsTranspose(0), #endif itsPPF(0), @@ -88,11 +96,11 @@ BGL_Processing::BGL_Processing(TransportHolder *th) { memset(itsArenas, 0, sizeof itsArenas); -#if defined HAVE_BGL - getPersonality(); -#endif +// #if defined HAVE_BGL +// getPersonality(); +// #endif -#if defined HAVE_ZOID && defined HAVE_BGL +#if defined HAVE_ZOID && (defined HAVE_BGL || defined HAVE_BGP) initIONode(); #endif } @@ -103,7 +111,8 @@ BGL_Processing::~BGL_Processing() } -#if defined HAVE_BGL +#if 0 + //#if 
defined HAVE_BGL struct Location { unsigned pset, rankInPset; @@ -142,21 +151,21 @@ void BGL_Processing::getPersonality() for (unsigned rank = 0; rank < allLocations.size(); rank ++) cores[allLocations[rank].pset][allLocations[rank].rankInPset] = rank; - for (unsigned pset = 0; pset < itsPersonality.numPsets(); pset ++) - std::clog << "pset " << pset << " contains cores " << cores[pset] << std::endl; +// for (unsigned pset = 0; pset < itsPersonality.numPsets(); pset ++) +// std::clog << "pset " << pset << " contains cores " << cores[pset] << std::endl; } } #endif -#if defined HAVE_ZOID && defined HAVE_BGL +#if defined HAVE_ZOID && (defined HAVE_BGL || defined HAVE_BGP) void BGL_Processing::initIONode() const { // one of the compute cores in each Pset has to initialize its I/O node - if (itsRankInPset == 0) { + if (itsLocationInfo.rankInPset() == 0) { std::vector<size_t> lengths; for (int arg = 0; original_argv[arg] != 0; arg ++) { @@ -226,96 +235,18 @@ void BGL_Processing::printSubbandList() const #endif - -#if 0 -void BGL_Processing::preprocess(CS1_Parset *parset) -{ - checkConsistency(parset); - -#if defined HAVE_BGL - unsigned usedCoresPerPset = parset->nrCoresPerPset(); - unsigned myPset = itsPersonality.getPsetNum(); - unsigned myCore = BGL_Mapping::reverseMapCoreOnPset(itsRankInPset, myPset); -#else - unsigned usedCoresPerPset = 1; - unsigned myPset = 0; - unsigned myCore = 0; -#endif - - vector<unsigned> inputPsets = parset->getUint32Vector("OLAP.BGLProc.inputPsets"); - vector<unsigned> outputPsets = parset->getUint32Vector("OLAP.BGLProc.outputPsets"); - -#if defined HAVE_BGL - Transpose::getMPIgroups(usedCoresPerPset, itsPersonality, inputPsets, outputPsets); -#endif - - vector<unsigned>::const_iterator inputPsetIndex = std::find(inputPsets.begin(), inputPsets.end(), myPset); - vector<unsigned>::const_iterator outputPsetIndex = std::find(outputPsets.begin(), outputPsets.end(), myPset); - - itsIsTransposeInput = inputPsetIndex != inputPsets.end(); - 
itsIsTransposeOutput = outputPsetIndex != outputPsets.end(); - - unsigned nrStations = parset->nrStations(); - unsigned nrBaselines = nrStations * (nrStations + 1) / 2; - unsigned nrSamplesPerIntegration = parset->BGLintegrationSteps(); - unsigned nrSamplesToBGLProc = parset->nrSamplesToBGLProc(); - - size_t inputDataSize = itsIsTransposeInput ? InputData::requiredSize(outputPsets.size(), nrSamplesToBGLProc) : 0; - size_t transposedDataSize = itsIsTransposeOutput ? TransposedData::requiredSize(nrStations, nrSamplesToBGLProc) : 0; - size_t filteredDataSize = itsIsTransposeOutput ? FilteredData::requiredSize(nrStations, nrSamplesPerIntegration) : 0; - size_t correlatedDataSize = itsIsTransposeOutput ? CorrelatedData::requiredSize(nrBaselines) : 0; - - itsHeaps[0] = new Heap(std::max(inputDataSize, filteredDataSize), 32); - itsHeaps[1] = new Heap(std::max(transposedDataSize, correlatedDataSize), 32); - - if (itsIsTransposeInput) { - itsInputData = new InputData(*itsHeaps[0], outputPsets.size(), nrSamplesToBGLProc); - } - - if (itsIsTransposeOutput) { - // FIXME: !useGather not implemented - ASSERT(parset->getBool("OLAP.IONProc.useGather")); - - unsigned nrSubbandsPerPset = parset->nrSubbandsPerPset(); - unsigned logicalNode = usedCoresPerPset * (outputPsetIndex - outputPsets.begin()) + myCore; - // TODO: logicalNode assumes output psets are consecutively numbered - - itsCenterFrequencies = parset->refFreqs(); - itsFirstSubband = (logicalNode / usedCoresPerPset) * nrSubbandsPerPset; - itsLastSubband = itsFirstSubband + nrSubbandsPerPset; - itsCurrentSubband = itsFirstSubband + logicalNode % usedCoresPerPset % nrSubbandsPerPset; - itsSubbandIncrement = usedCoresPerPset % nrSubbandsPerPset; - -#if defined HAVE_MPI - printSubbandList(); -#endif - - itsTransposedData = new TransposedData(*itsHeaps[1], nrStations, nrSamplesToBGLProc); - itsFilteredData = new FilteredData(*itsHeaps[0], nrStations, nrSamplesPerIntegration); - itsCorrelatedData = new 
CorrelatedData(*itsHeaps[1], nrBaselines); - - itsPPF = new PPF(nrStations, nrSamplesPerIntegration, parset->sampleRate() / NR_SUBBAND_CHANNELS, parset->getBool("OLAP.delayCompensation")); - itsCorrelator = new Correlator(nrStations, nrSamplesPerIntegration); - } - -#if defined HAVE_MPI - if (itsIsTransposeInput || itsIsTransposeOutput) { - itsTranspose = new Transpose(itsIsTransposeInput, itsIsTransposeOutput, myCore, nrStations); - itsTranspose->setupTransposeParams(inputPsets, outputPsets, itsInputData, itsTransposedData); - } -#endif -} - -#else - void BGL_Processing::preprocess(BGL_Configuration &configuration) { //checkConsistency(parset); TODO -#if defined HAVE_BGL +// #if defined HAVE_BGL +// unsigned usedCoresPerPset = configuration.nrUsedCoresPerPset(); +// unsigned myPset = itsPersonality.getPsetNum(); +// unsigned myCore = BGL_Mapping::reverseMapCoreOnPset(itsRankInPset, myPset); +#if defined HAVE_BGL || HAVE_BGP unsigned usedCoresPerPset = configuration.nrUsedCoresPerPset(); - unsigned myPset = itsPersonality.getPsetNum(); - unsigned myCore = BGL_Mapping::reverseMapCoreOnPset(itsRankInPset, myPset); + unsigned myPset = itsLocationInfo.psetNumber(); + unsigned myCore = BGL_Mapping::reverseMapCoreOnPset(itsLocationInfo.rankInPset(), myPset); #else unsigned usedCoresPerPset = 1; unsigned myPset = 0; @@ -324,8 +255,10 @@ void BGL_Processing::preprocess(BGL_Configuration &configuration) std::vector<unsigned> &inputPsets = configuration.inputPsets(); std::vector<unsigned> &outputPsets = configuration.outputPsets(); -#if defined HAVE_BGL - Transpose::getMPIgroups(usedCoresPerPset, itsPersonality, inputPsets, outputPsets); +// #if defined HAVE_BGL +// Transpose::getMPIgroups(usedCoresPerPset, itsPersonality, inputPsets, outputPsets); +#if defined HAVE_BGP || defined HAVE_BGL + Transpose::getMPIgroups(usedCoresPerPset, itsLocationInfo, inputPsets, outputPsets); #endif std::vector<unsigned>::const_iterator inputPsetIndex = std::find(inputPsets.begin(), 
inputPsets.end(), myPset); @@ -397,13 +330,13 @@ void BGL_Processing::preprocess(BGL_Configuration &configuration) } #endif } -#endif void BGL_Processing::process() { NSTimer totalTimer("total", LOG_CONDITION); totalTimer.start(); + if (itsIsTransposeInput) { #if defined HAVE_MPI if (LOG_CONDITION) @@ -446,9 +379,6 @@ MPI_Barrier(itsTransposeGroup); itsCorrelator->computeFlagsAndCentroids(itsFilteredData, itsCorrelatedData); itsCorrelator->correlate(itsFilteredData, itsCorrelatedData); - if ((itsCurrentSubband += itsSubbandIncrement) >= itsLastSubband) - itsCurrentSubband -= itsLastSubband - itsFirstSubband; - computeTimer.stop(); #if defined HAVE_MPI @@ -475,6 +405,10 @@ MPI_Barrier(itsTransposeGroup); for (double time = MPI_Wtime() + 4.0; MPI_Wtime() < time;) ; #endif + + if ((itsCurrentSubband += itsSubbandIncrement) >= itsLastSubband) + itsCurrentSubband -= itsLastSubband - itsFirstSubband; + totalTimer.stop(); } diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h index 23152837bf8..fd4cb500602 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h +++ b/Appl/CEP/CS1/CS1_BGLProc/src/BGL_Processing.h @@ -21,7 +21,7 @@ #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_BGL_PROCESSING_H #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_BGL_PROCESSING_H -#if 0 || !defined HAVE_BGL +#if 0 || !(defined HAVE_BGL || defined HAVE_BGP) #define C_IMPLEMENTATION #endif @@ -34,6 +34,7 @@ #endif #include <CS1_Interface/Allocator.h> + #include <InputData.h> #include <FilteredData.h> #include <TransposedData.h> @@ -43,9 +44,7 @@ #include <PPF.h> #include <Correlator.h> -#if defined HAVE_BGL -#include <mpi.h> -#endif +#include <LocationInfo.h> #if defined HAVE_BGL #include <bglpersonality.h> @@ -60,7 +59,7 @@ namespace CS1 { class BGL_Processing { public: - BGL_Processing(TransportHolder *th); + BGL_Processing(TransportHolder *, const LocationInfo &); ~BGL_Processing(); #if 0 @@ -91,6 +90,7 @@ class BGL_Processing { #endif TransportHolder 
*itsTransportHolder; + const LocationInfo &itsLocationInfo; std::vector<double> itsCenterFrequencies; unsigned itsFirstSubband, itsCurrentSubband, itsLastSubband, itsSubbandIncrement; bool itsIsTransposeInput, itsIsTransposeOutput; diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/CS1_BGL_Processing_main.cc b/Appl/CEP/CS1/CS1_BGLProc/src/CS1_BGL_Processing_main.cc index b95c7cee955..510a5d685d7 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/CS1_BGL_Processing_main.cc +++ b/Appl/CEP/CS1/CS1_BGLProc/src/CS1_BGL_Processing_main.cc @@ -26,9 +26,12 @@ #include <Common/Exception.h> #include <CS1_Interface/BGL_Command.h> #include <CS1_Interface/BGL_Configuration.h> +#include <Transport/TH_File.h> #include <Transport/TH_Null.h> +#include <Transport/TH_Socket.h> #include <CS1_BGLProc/TH_ZoidClient.h> #endif +#include <CS1_BGLProc/LocationInfo.h> #include <CS1_BGLProc/BGL_Processing.h> #include <CS1_BGLProc/Package__Version.h> #include <Transport/TH_MPI.h> @@ -40,6 +43,8 @@ using namespace LOFAR::CS1; int main(int argc, char **argv) { + std::clog.rdbuf(std::cout.rdbuf()); + try { BGL_Processing::original_argv = argv; @@ -53,18 +58,34 @@ int main(int argc, char **argv) Version::show<CS1_BGLProcVersion> (std::cout, "CS1_BGLProc", type); } + LocationInfo locationInfo; + #if defined HAVE_ZOID && defined HAVE_BGL - TH_ZoidClient th; + TH_ZoidClient th; +#elif 0 + TH_Null th; +#elif 0 + usleep(10000 * locationInfo.rankInPset()); // do not connect all at the same time + + TH_Socket th("127.0.0.1", boost::lexical_cast<string>(5000 + locationInfo.rankInPset())); + + while (!th.init()) + sleep(1); #else - TH_Null th; + TH_File th(string("/tmp/sock.") + boost::lexical_cast<string>(locationInfo.rankInPset()), TH_File::Read); + + while (!th.init()) + sleep(1); #endif - BGL_Processing proc(&th); - BGL_Command command; + BGL_Processing proc(&th, locationInfo); + BGL_Command command; do { +std::clog << TH_MPI::getCurrentRank() << " read command" << std::endl; command.read(&th); +std::clog << 
TH_MPI::getCurrentRank() << " received command " << (unsigned) command.value() << std::endl; switch (command.value()) { case BGL_Command::PREPROCESS : { BGL_Configuration configuration; @@ -82,6 +103,7 @@ int main(int argc, char **argv) default : break; } +std::clog << TH_MPI::getCurrentRank() << " command handled" << std::endl; } while (command.value() != BGL_Command::STOP); #if defined HAVE_MPI diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/CacheAlignedAllocator.h b/Appl/CEP/CS1/CS1_BGLProc/src/CacheAlignedAllocator.h index 71898996683..ebff5e89ef0 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/CacheAlignedAllocator.h +++ b/Appl/CEP/CS1/CS1_BGLProc/src/CacheAlignedAllocator.h @@ -5,7 +5,7 @@ #include <memory> -#if defined HAVE_BGL +#if defined HAVE_BGL || defined HAVE_BGP #define CACHE_LINE_SIZE 32 #define CACHE_ALIGNED __attribute__ ((aligned(CACHE_LINE_SIZE))) #else diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc b/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc index 7e623cf2706..2656c6fffa7 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc +++ b/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.cc @@ -190,6 +190,7 @@ void Correlator::correlate(const FilteredData *filteredData, CorrelatedData *cor for (unsigned stat2 = nrValidStations % 2 ? 
1 : 2; stat2 < nrValidStations; stat2 += 2) { unsigned stat1 = 0; +#if defined HAVE_BGL // do as many 3x2 blocks as possible for (; stat1 + 3 <= stat2; stat1 += 3) { unsigned stat10 = map[stat1], stat11 = map[stat1+1], stat12 = map[stat1+2]; @@ -208,9 +209,10 @@ void Correlator::correlate(const FilteredData *filteredData, CorrelatedData *cor correlatedData->visibilities[baseline(stat12, stat21)][ch].origin(), itsNrSamplesPerIntegration); } +#endif // see if a 2x2 block is necessary - if (stat1 + 2 <= stat2) { + for (; stat1 + 2 <= stat2; stat1 += 2) { unsigned stat10 = map[stat1], stat11 = map[stat1+1]; unsigned stat20 = map[stat2], stat21 = map[stat2+1]; @@ -223,7 +225,6 @@ void Correlator::correlate(const FilteredData *filteredData, CorrelatedData *cor correlatedData->visibilities[baseline(stat11, stat20)][ch].origin(), correlatedData->visibilities[baseline(stat11, stat21)][ch].origin(), itsNrSamplesPerIntegration); - stat1 += 2; } // do the remaining (auto)correlations near the diagonal diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h b/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h index 730b1d9de70..0ebae44d791 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h +++ b/Appl/CEP/CS1/CS1_BGLProc/src/Correlator.h @@ -1,7 +1,7 @@ #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_CORRELATOR_H #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_CORRELATOR_H -#if 0 || !defined HAVE_BGL +#if 0 || !(defined HAVE_BGL || defined HAVE_BGP) #define CORRELATOR_C_IMPLEMENTATION #endif diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S b/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S index 341ac3671d1..b8608d24550 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S +++ b/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.S @@ -18,7 +18,7 @@ # # $Id$ -#if defined HAVE_BGL +#if defined HAVE_BGL || defined HAVE_BGP #define BGL_PROCESSING #include <CS1_Interface/CS1_Config.h> @@ -218,25 +218,33 @@ loop1: # loop over time bdnz loop1 +#if !defined HAVE_BGP dcbz 0,7 +#endif stfpsx 0,0,7 # store 
results S0 * ~S2 stfpsux 1,7,11 stfpsux 2,7,11 stfpsux 3,7,11 +#if !defined HAVE_BGP dcbz 0,8 +#endif stfpsx 4,0,8 # store results S0 * ~S3 stfpsux 5,8,11 stfpsux 6,8,11 stfpsux 7,8,11 +#if !defined HAVE_BGP dcbz 0,9 +#endif stfpsx 8,0,9 # store results S1 * ~S2 stfpsux 9,9,11 stfpsux 10,9,11 stfpsux 11,9,11 +#if !defined HAVE_BGP dcbz 0,10 +#endif stfpsx 12,0,10 # store results S1 * ~S3 stfpsux 13,10,11 stfpsux 14,10,11 @@ -389,18 +397,18 @@ loop2: fxcxma 20,24,30,20 fxcxma 21,24,31,21 fxcxma 22,25,30,22 ; lfpsux 30,7,11 - fxcxma 23,25,31,23 ; lfpsux 31,7,11 + fxcxma 23,25,31,23 # S0 * ~S3, phase 1 ; # ld S1 - fxcpnsma 0,26,28,0 ; lfpsux 24,4,11 - fxcpnsma 1,26,29,1 ; lfpsux 25,4,11 - fxcpnsma 2,27,28,2 - fxcpnsma 3,27,29,3 + fxcpnsma 0,26,28,0 + fxcpnsma 1,26,29,1 + fxcpnsma 2,27,28,2 ; lfpsux 31,7,11 + fxcpnsma 3,27,29,3 ; lfpsux 24,4,11 # S0 * ~S4, phase 1 fxcpnsma 4,26,30,4 fxcpnsma 5,26,31,5 - fxcpnsma 6,27,30,6 + fxcpnsma 6,27,30,6 ; lfpsux 25,4,11 fxcpnsma 7,27,31,7 # S0 * ~S4, phase 2 @@ -469,37 +477,49 @@ loop2: lwz 4,288+12(1) lwz 5,288+16(1) +#if !defined HAVE_BGP dcbz 0,8 +#endif stfpsx 0,0,8 # store results S0 * ~S3 stfpsux 1,8,11 stfpsux 2,8,11 stfpsux 3,8,11 +#if !defined HAVE_BGP dcbz 0,9 +#endif stfpsx 4,0,9 # store results S0 * ~S4 stfpsux 5,9,11 stfpsux 6,9,11 stfpsux 7,9,11 +#if !defined HAVE_BGP dcbz 0,10 +#endif stfpsx 8,0,10 # store results S1 * ~S3 stfpsux 9,10,11 stfpsux 10,10,11 stfpsux 11,10,11 +#if !defined HAVE_BGP dcbz 0,3 +#endif stfpsx 12,0,3 # store results S1 * ~S4 stfpsux 13,3,11 stfpsux 14,3,11 stfpsux 15,3,11 +#if !defined HAVE_BGP dcbz 0,4 +#endif stfpsx 16,0,4 # store results S2 * ~S3 stfpsux 17,4,11 stfpsux 18,4,11 stfpsux 19,4,11 +#if !defined HAVE_BGP dcbz 0,5 +#endif stfpsx 20,0,5 # store results S2 * ~S4 stfpsux 21,5,11 stfpsux 22,5,11 @@ -593,10 +613,12 @@ loop4: # loop over time bdnz loop4 fxcsnsma 0,0,12,0 # f0r += 1*f0i, f0i += -1*f0i = 0 - fxcsnsma 3,3,12,3 fpadd 1,1,2 + fxcsnsma 3,3,12,3 +#if !defined HAVE_BGP dcbz 
0,4 # store result +#endif stfpsx 0,0,4 stfpsux 1,4,8 fsneg 1,1 @@ -733,20 +755,26 @@ loop3: # loop over time fxcsnsma 8,8,12,8 fxcsnsma 11,11,12,11 +#if !defined HAVE_BGP dcbz 0,5 +#endif stfpsx 0,0,5 # store results S0 * ~S0 stfpsux 1,5,11 fsneg 1,1 stfpsux 1,5,11 stfpsux 3,5,11 +#if !defined HAVE_BGP dcbz 0,6 +#endif stfpsx 4,0,6 # store results S0 * ~S1 stfpsux 5,6,11 stfpsux 6,6,11 stfpsux 7,6,11 +#if !defined HAVE_BGP dcbz 0,7 +#endif stfpsx 8,0,7 # store results S1 * ~S1 stfpsux 9,7,11 fsneg 9,9 @@ -972,32 +1000,42 @@ loop7: # loop over time fxcsnsma 15,15,18,15 fxcsnsma 17,17,18,17 +#if !defined HAVE_BGP dcbz 0,6 +#endif stfpsx 0,0,6 # store results S0 * ~S1 stfpsux 1,6,11 stfpsux 2,6,11 stfpsux 3,6,11 +#if !defined HAVE_BGP dcbz 0,7 +#endif stfpsx 4,0,7 # store results S0 * ~S2 stfpsux 5,7,11 stfpsux 6,7,11 stfpsux 7,7,11 +#if !defined HAVE_BGP dcbz 0,8 +#endif stfpsx 8,0,8 # store results S1 * ~S1 stfpsux 9,8,11 fsneg 9,9 stfpsux 9,8,11 stfpsux 10,8,11 +#if !defined HAVE_BGP dcbz 0,9 +#endif stfpsx 11,0,9 # store results S1 * ~S2 stfpsux 12,9,11 stfpsux 13,9,11 stfpsux 14,9,11 +#if !defined HAVE_BGP dcbz 0,10 +#endif stfpsx 15,0,10 # store results S2 * ~S2 stfpsux 16,10,11 fsneg 16,16 @@ -1139,7 +1177,19 @@ _add_correlations: .align 5 .global _clear_correlation _clear_correlation: +#if defined HAVE_BGP + li 0,0 + stw 0,0(3) + stw 0,4(3) + stw 0,8(3) + stw 0,12(3) + stw 0,16(3) + stw 0,20(3) + stw 0,24(3) + stw 0,28(3) +#else dcbz 0,3 # clear the entire cache line, it contains all +#endif blr # polarizations diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h b/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h index 6293a7d2321..0b01c93afde 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h +++ b/Appl/CEP/CS1/CS1_BGLProc/src/CorrelatorAsm.h @@ -21,7 +21,7 @@ #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_CORRELATOR_ASM_H #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_CORRELATOR_ASM_H -#if defined HAVE_BGL +#if defined HAVE_BGL || defined HAVE_BGP #include 
<CS1_Interface/CS1_Config.h> namespace LOFAR { diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S b/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S index 2ef3f3b5dcc..734479ccc95 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S +++ b/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.S @@ -1,4 +1,4 @@ -#if defined HAVE_BGL +#if defined HAVE_BGL || defined HAVE_BGP _C1: .long 0x3F800000, 0x3F800000 _W: .long 0x3F6C835E, 0xBEC3EF15 diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h b/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h index 1b5d4f8fd33..1f45873b97a 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h +++ b/Appl/CEP/CS1/CS1_BGLProc/src/FFT_Asm.h @@ -21,7 +21,7 @@ #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FFT_ASM_H #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FFT_ASM_H -#if defined HAVE_BGL +#if defined HAVE_BGL || defined HAVE_BGP #include <Common/lofar_complex.h> namespace LOFAR { diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FIR.h b/Appl/CEP/CS1/CS1_BGLProc/src/FIR.h index 151d99437cb..8ae73d35218 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/FIR.h +++ b/Appl/CEP/CS1/CS1_BGLProc/src/FIR.h @@ -1,7 +1,7 @@ #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FIR_H #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FIR_H -#if 0 || !defined HAVE_BGL +#if 0 || !(defined HAVE_BGL || defined HAVE_BGP) #define FIR_C_IMPLEMENTATION #endif diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S index bb2b949c4d4..20283314686 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S +++ b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.S @@ -17,7 +17,7 @@ # # $Id$ -#if defined HAVE_BGL +#if defined HAVE_BGL || defined HAVE_BGP #define BGL_PROCESSING #include <CS1_Interface/CS1_Config.h> @@ -93,7 +93,9 @@ L4: lfpsux 2,4,6 lfpsux 3,4,6 +#if !defined HAVE_BGP dcbz 3,7 +#endif stfpsux 0,3,7 stfpsux 1,3,9 stfpsux 2,3,9 @@ -164,7 +166,11 @@ L5: fxcxnpma 3,3,10,15 ; lfpsux 7,4,7 fxcxnpma 8,8,9,16 +#if defined HAVE_BGP + fxcxnpma 10,10,11,17 +#else fxcxnpma 10,10,11,17 ; dcbz 3,6 +#endif fxpmul 12,4,8 ; stfpsux 0,3,6 
fxpmul 13,5,8 ; stfpsux 1,3,10 @@ -180,7 +186,11 @@ L5: fxcxnpma 7,7,10,15 ; lfpsux 3,4,7 fxcxnpma 8,8,9,16 +#if defined HAVE_BGP + fxcxnpma 10,10,11,17 +#else fxcxnpma 10,10,11,17 ; dcbz 3,6 +#endif fxpmul 12,0,8 ; stfpsux 4,3,6 fxpmul 13,1,8 ; stfpsux 5,3,10 @@ -1604,9 +1614,29 @@ L8: cmpi 0,7,NR_SAMPLES_PER_INTEGRATION #endif +#if defined HAVE_BGP +zero: .long 0,0 +#endif + .align 5 .global _memzero _memzero: +#if defined HAVE_BGP + lis 5,zero@ha ; srwi 4,4,7 + addi 5,5,zero@l ; mtctr 4 + lfpsx 0,0,5 ; li 8,16 + subi 3,3,16 + +L1: stfpdux 0,3,8 + stfpdux 0,3,8 + stfpdux 0,3,8 + stfpdux 0,3,8 + stfpdux 0,3,8 + stfpdux 0,3,8 + stfpdux 0,3,8 + stfpdux 0,3,8 + bdnz L1 +#else srwi 4,4,7 mtctr 4 li 4,32 @@ -1614,13 +1644,13 @@ _memzero: li 6,96 li 7,128 -L1: - dcbz 0,3 +L1: dcbz 0,3 dcbz 3,4 dcbz 3,5 dcbz 3,6 add 3,3,7 bdnz L1 +#endif blr diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h index 709cc3d6c99..c40ce8e59ff 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h +++ b/Appl/CEP/CS1/CS1_BGLProc/src/FIR_Asm.h @@ -21,10 +21,12 @@ #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FIR_ASM_H #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_FIR_ASM_H -#if defined HAVE_BGL +#if defined HAVE_BGL || defined HAVE_BGP #include <CS1_Interface/CS1_Config.h> +#if defined HAVE_BGL #include <rts.h> +#endif namespace LOFAR { @@ -65,7 +67,10 @@ extern "C" { unsigned nr_polarizations; } _FIR_constants_used; +#if defined HAVE_BGL void _bgl_mutex_lock(BGL_Mutex *), _bgl_mutex_unlock(BGL_Mutex *); +#endif + unsigned long long _rdtsc(); }; diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc b/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc new file mode 100644 index 00000000000..60727cf0127 --- /dev/null +++ b/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.cc @@ -0,0 +1,134 @@ +#include <lofar_config.h> + +#include <LocationInfo.h> + +#include <CS1_Interface/BGL_Mapping.h> +#include <CS1_Interface/PrintVector.h> +#include <Transport/TH_MPI.h> + +#if defined 
HAVE_BGP +#include <common/bgp_personality_inlines.h> +#include <spi/kernel_interface.h> +#endif + + +#include <iostream> + + +namespace LOFAR { +namespace CS1 { + +LocationInfo::LocationInfo() +{ +#if defined HAVE_BGP || defined HAVE_BGL + getPersonality(); +#endif + +} + + +#if defined HAVE_BGP +void LocationInfo::getPersonality() +{ + if (Kernel_GetPersonality(&itsPersonality, sizeof itsPersonality) != 0) { + std::cerr << "could not get personality" << std::endl; + exit(1); + } + + if (TH_MPI::getCurrentRank() == 0) + std::clog << "topology = (" + << BGP_Personality_xSize(&itsPersonality) << ',' + << BGP_Personality_ySize(&itsPersonality) << ',' + << BGP_Personality_zSize(&itsPersonality) << "), torus wraparound = (" + << (BGP_Personality_isTorusX(&itsPersonality) ? 'T' : 'F') << ',' + << (BGP_Personality_isTorusY(&itsPersonality) ? 'T' : 'F') << ',' + << (BGP_Personality_isTorusZ(&itsPersonality) ? 'T' : 'F') << ')' + << std::endl; + + itsPsetNumbers.resize(TH_MPI::getNumberOfNodes()); + itsPsetNumber = BGP_Personality_psetNum(&itsPersonality); + itsPsetNumbers[TH_MPI::getCurrentRank()] = itsPsetNumber; + + for (int core = 0; core < TH_MPI::getNumberOfNodes(); core ++) + MPI_Bcast(&itsPsetNumbers[core], 1, MPI_INT, core, MPI_COMM_WORLD); + + itsRankInPset = 0; + + for (int rank = 0; rank < TH_MPI::getCurrentRank(); rank ++) + if (itsPsetNumbers[rank] == itsPsetNumber) + ++ itsRankInPset; + + //usleep(100000 * TH_MPI::getCurrentRank()); + + if (TH_MPI::getCurrentRank() == 0) { + std::vector<std::vector<unsigned> > cores(BGP_Personality_numIONodes(&itsPersonality)); + + for (unsigned rank = 0; rank < itsPsetNumbers.size(); rank ++) + cores[itsPsetNumbers[rank]].push_back(rank); + + for (unsigned pset = 0; pset < BGP_Personality_numIONodes(&itsPersonality); pset ++) + std::clog << "pset " << pset << " contains cores " << cores[pset] << std::endl; + } +} + +#endif + +#if defined HAVE_BGL +void LocationInfo::getPersonality() +{ + if 
(rts_get_personality(&itsPersonality, sizeof(itsPersonality)) != 0) { + std::cerr << "could not get personality" << std::endl; + exit(1); + } + + if (TH_MPI::getCurrentRank == 0) + std::clog << "topology = (" + << itsPersonality.getXsize() << ',' + << itsPersonality.getYsize() << ',' + << itsPersonality.getZsize() << "), torus wraparound = (" + << (itsPersonality.isTorusX() ? 'T' : 'F') << ',' + << (itsPersonality.isTorusY() ? 'T' : 'F') << ',' + << (itsPersonality.isTorusZ() ? 'T' : 'F') << ')' + << std::endl; + + itsPsetNumbers.resize(TH_MPI::getNumberOfNodes()); + itsPsetNumber = itsPersonality.getPsetNum(); + itsPsetNumbers[TH_MPI::getCurrentRank()] = itsPsetNumber; + + for (int core = 0; core < TH_MPI::getNumberOfNodes(); core ++) + MPI_Bcast(&itsPsetNumbers[core], 1, MPI_INT, core, MPI_COMM_WORLD); + + itsRankInPset = 0; + + for (int rank = 0; rank < TH_MPI::getCurrentRank(); rank ++) + if (itsPsetNumbers[rank] == itsPsetNumber) + ++ itsRankInPset; + + //usleep(100000 * TH_MPI::getCurrentRank()); + + if (TH_MPI::getCurrentRank() == 0) { + std::vector<std::vector<unsigned> > cores(itsPersonality.numIONodes()); + + for (unsigned rank = 0; rank < itsPsetNumbers.size(); rank ++) + cores[itsPsetNumbers[rank]].push_back(rank); + + for (unsigned pset = 0; pset < itsPersonality.numPsets(); pset ++) + std::clog << "LocationInfo :: pset " << pset << " contains cores " << cores[pset] << std::endl; + } +} + +#endif + + +unsigned LocationInfo::remapOnTree(unsigned pset, unsigned core) const +{ + core = BGL_Mapping::mapCoreOnPset(core, pset); + + for (unsigned rank = 0;; rank ++) + if (itsPsetNumbers[rank] == pset && core -- == 0) + return rank; +} + + +} // namespace CS1 +} // namespace LOFAR diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h b/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h new file mode 100644 index 00000000000..9d60cd265e4 --- /dev/null +++ b/Appl/CEP/CS1/CS1_BGLProc/src/LocationInfo.h @@ -0,0 +1,67 @@ +#ifndef 
LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_ALLOCATOR_H +#define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_ALLOCATOR_H + +#include <vector> + +#if defined HAVE_BGP +// we do not need mpi.h here, but including it after bgp_personality.h leads +// to compilation errors +#define MPICH_IGNORE_CXX_SEEK +#include <mpi.h> + +#include <common/bgp_personality.h> +#endif + +#if defined HAVE_BGL +#include <bglpersonality.h> +#include <rts.h> +#endif + + +namespace LOFAR { +namespace CS1 { + +class LocationInfo +{ + public: + LocationInfo(); + + unsigned remapOnTree(unsigned pset, unsigned core) const; + + unsigned psetNumber() const; + unsigned rankInPset() const; + + private: +#if defined HAVE_BGP || defined HAVE_BGL + void getPersonality(); +#endif + +#if defined HAVE_BGP + _BGP_Personality_t itsPersonality; + std::vector<unsigned> itsPsetNumbers; + unsigned itsPsetNumber, itsRankInPset; +#endif + +#if defined HAVE_BGL + BGLPersonality itsPersonality; + std::vector<unsigned> itsPsetNumbers; + unsigned itsPsetNumber, itsRankInPset; +#endif +}; + + +inline unsigned LocationInfo::psetNumber() const +{ + return itsPsetNumber; +} + + +inline unsigned LocationInfo::rankInPset() const +{ + return itsRankInPset; +} + +} // namespace CS1 +} // namespace LOFAR + +#endif diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am b/Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am index 2bd9b6f094b..108ab8d887a 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am +++ b/Appl/CEP/CS1/CS1_BGLProc/src/Makefile.am @@ -1,6 +1,6 @@ INSTHDRS =\ Package__Version.h \ -Allocator.h \ +LocationInfo.h \ CorrelatorAsm.h \ FFT_Asm.h \ FIR_Asm.h \ @@ -34,7 +34,7 @@ CCASFLAGS = $(patsubst -q%,,$(CPPFLAGS)) $(EXTRA_CPPFLAGS) CS1_BGL_Processing_SOURCES = $(DOCHDRS) \ Package__Version.cc \ -Allocator.cc \ +LocationInfo.cc \ CorrelatorAsm.S \ FIR_Asm.S \ FFT_Asm.S \ diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc index 40f2e825dcb..de4ea11b9f7 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc +++ 
b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.cc @@ -197,7 +197,7 @@ void PPF::filter(double centerFrequency, const TransposedData *transposedData, F #endif #if defined PPF_C_IMPLEMENTATION - fcomplex fftOutData[NR_SUBBAND_CHANNELS]; + fcomplex fftOutData[NR_SUBBAND_CHANNELS] __attribute__ ((aligned(sizeof(fcomplex)))); FIRtimer.start(); for (unsigned pol = 0; pol < NR_POLARIZATIONS; pol ++) { diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.h b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.h index 81831cafebc..c536655bb59 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/PPF.h +++ b/Appl/CEP/CS1/CS1_BGLProc/src/PPF.h @@ -1,7 +1,7 @@ #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_PPF_H #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_PPF_H -#if 0 || !defined HAVE_BGL +#if 0 || !(defined HAVE_BGL || defined HAVE_BGP) #define PPF_C_IMPLEMENTATION #endif diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc b/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc index 42745052ff3..ca075177b38 100644 --- a/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc +++ b/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.cc @@ -10,6 +10,9 @@ #if defined HAVE_BGL #include <rts.h> +//#elif defined HAVE_BGP +//#include <common/bgp_personality_inlines.h> +//#include <spi/kernel_interface.h> #endif #include <cassert> @@ -44,48 +47,68 @@ Transpose::~Transpose() } -#if defined HAVE_BGL +// #if defined HAVE_BGL -unsigned Transpose::remapOnTree(unsigned pset, unsigned core, const struct BGLPersonality &personality) -{ - unsigned psetXsize = personality.getXpsetSize(); - unsigned psetYsize = personality.getYpsetSize(); - unsigned psetZsize = personality.getZpsetSize(); +// unsigned Transpose::remapOnTree(unsigned pset, unsigned core, const struct BGLPersonality &personality) +// { +// unsigned psetXsize = personality.getXpsetSize(); +// unsigned psetYsize = personality.getYpsetSize(); +// unsigned psetZsize = personality.getZpsetSize(); - unsigned psetXcount = personality.getXsize() / psetXsize; - unsigned psetYcount = personality.getYsize() / psetYsize; - unsigned 
psetZcount = personality.getZsize() / psetZsize; +// unsigned psetXcount = personality.getXsize() / psetXsize; +// unsigned psetYcount = personality.getYsize() / psetYsize; +// unsigned psetZcount = personality.getZsize() / psetZsize; - unsigned xOrigin = pset % psetXcount * psetXsize; - unsigned yOrigin = pset / psetXcount % psetYcount * psetYsize; - unsigned zOrigin = pset / psetXcount / psetYcount % psetZcount * psetZsize; +// unsigned xOrigin = pset % psetXcount * psetXsize; +// unsigned yOrigin = pset / psetXcount % psetYcount * psetYsize; +// unsigned zOrigin = pset / psetXcount / psetYcount % psetZcount * psetZsize; - unsigned nodesPerPset = personality.numNodesInPset(); +// unsigned nodesPerPset = personality.numNodesInPset(); - unsigned numProcs, xOffset, yOffset, zOffset, node; +// unsigned numProcs, xOffset, yOffset, zOffset, node; - core = BGL_Mapping::mapCoreOnPset(core, pset); - personality.coordsForPsetRank(core % nodesPerPset, xOffset, yOffset, zOffset); +// core = BGL_Mapping::mapCoreOnPset(core, pset); +// personality.coordsForPsetRank(core % nodesPerPset, xOffset, yOffset, zOffset); - unsigned x = xOrigin + xOffset - personality.xPsetOrigin(); - unsigned y = yOrigin + yOffset - personality.yPsetOrigin(); - unsigned z = zOrigin + zOffset - personality.zPsetOrigin(); - unsigned t = core / nodesPerPset; +// unsigned x = xOrigin + xOffset - personality.xPsetOrigin(); +// unsigned y = yOrigin + yOffset - personality.yPsetOrigin(); +// unsigned z = zOrigin + zOffset - personality.zPsetOrigin(); +// unsigned t = core / nodesPerPset; - rts_rankForCoordinates(x, y, z, t, &node, &numProcs); +// rts_rankForCoordinates(x, y, z, t, &node, &numProcs); -#if defined HAVE_MPI - if (node >= (unsigned) TH_MPI::getNumberOfNodes()) { - std::cerr << "not enough nodes allocated (node = " << node << ", TH_MPI::getNumberOfNodes() = " << TH_MPI::getNumberOfNodes() << std::endl; - exit(1); - } -#endif +// #if defined HAVE_MPI +// if (node >= (unsigned) 
TH_MPI::getNumberOfNodes()) {
+//     std::cerr << "not enough nodes allocated (node = " << node << ", TH_MPI::getNumberOfNodes() = " << TH_MPI::getNumberOfNodes() << std::endl;
+//     exit(1);
+//   }
+// #endif
+
+//   return node;
+// }
 
-  return node;
+// #elif defined HAVE_BGP
+#if defined HAVE_BGL || defined HAVE_BGP
+
+unsigned Transpose::remapOnTree(unsigned pset, unsigned core, const std::vector<unsigned> &psetNumbers) // returns the index of the matching psetNumbers entry
+{
+  core = BGL_Mapping::mapCoreOnPset(core, pset); // translate the logical core number first
+
+  for (unsigned rank = 0;; rank ++) // entries are visited in ascending order; at() bounds the scan
+    if (psetNumbers.at(rank) == pset && core -- == 0) // throws std::out_of_range if pset has too few entries
+      return rank;
 }
 
+#endif
+
 
-void Transpose::getMPIgroups(unsigned nrCoresPerPset, const struct BGLPersonality &personality, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets)
+// #if defined HAVE_BGL
+// void Transpose::getMPIgroups(unsigned nrCoresPerPset, const struct BGLPersonality &personality, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets)
+// #elif defined HAVE_BGP
+#if defined HAVE_BGL || defined HAVE_BGP
+void Transpose::getMPIgroups(unsigned nrCoresPerPset, const LocationInfo &locationInfo, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets)
+#endif
+#if defined HAVE_BGL || defined HAVE_BGP
 {
   allTransposeGroups.resize(nrCoresPerPset);
 
@@ -105,10 +128,14 @@ void Transpose::getMPIgroups(unsigned nrCoresPerPset, const struct BGLPersonalit
       std::vector<int> ranks;
 
       for (std::set<unsigned>::const_iterator pset = psets.begin(); pset != psets.end(); pset ++)
+#if 0 // defined HAVE_BGL
         ranks.push_back(remapOnTree(*pset, core, personality));
+#else
+        ranks.push_back(locationInfo.remapOnTree(*pset, core));
+#endif
 
       if (TH_MPI::getCurrentRank() == 0)
-        std::clog << "group " << core << " contains cores " << ranks << std::endl;
+        std::clog << "Transpose :: group " << core << " contains cores " << ranks << std::endl;
 
       if (MPI_Group_incl(all, ranks.size(), &ranks[0], &group) != MPI_SUCCESS) {
         std::cerr << "MPI_Group_incl()
failed" << std::endl;
diff --git a/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h b/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h
index 222ca7f11c2..6d12bab68d9 100644
--- a/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h
+++ b/Appl/CEP/CS1/CS1_BGLProc/src/Transpose.h
@@ -1,19 +1,22 @@
+
 #ifndef LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_TRANSPOSE_H
 #define LOFAR_APPL_CEP_CS1_CS1_BGL_PROC_TRANSPOSE_H
 
 #include <InputData.h>
+#include <LocationInfo.h>
 #include <TransposedData.h>
 
 #include <boost/multi_array.hpp>
 
-#if defined HAVE_BGL
-#include <bglpersonality.h>
-#endif
-
 #if defined HAVE_MPI
+#define MPICH_IGNORE_CXX_SEEK
 #include <mpi.h>
 #endif
 
+#if defined HAVE_BGL
+#include <bglpersonality.h>
+#endif
+
 #include <vector>
 
 
@@ -28,10 +31,14 @@ class Transpose {
   ~Transpose();
 
   void setupTransposeParams(const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets, InputData *, TransposedData *);
-  static void   getMPIgroups(unsigned nrCoresPerPset, const BGLPersonality &, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets);
 
-#if defined HAVE_BGL
-  static unsigned remapOnTree(unsigned pset, unsigned core, const struct BGLPersonality &);
+/* #if defined HAVE_BGL */
+/*   static void        getMPIgroups(unsigned nrCoresPerPset, const BGLPersonality &, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets); */
+/*   static unsigned remapOnTree(unsigned pset, unsigned core, const struct BGLPersonality &); */
+/* #elif defined HAVE_BGP */
+#if defined HAVE_BGL || defined HAVE_BGP /* same guard as the definitions in Transpose.cc */
+  static void   getMPIgroups(unsigned nrCoresPerPset, const LocationInfo &, const std::vector<unsigned> &inputPsets, const std::vector<unsigned> &outputPsets); /* rank lookup goes through LocationInfo::remapOnTree */
+  static unsigned remapOnTree(unsigned pset, unsigned core, const std::vector<unsigned> &psetNumbers); /* index of the requested core's entry in psetNumbers */
 #endif
 
   void transpose(const InputData *, TransposedData *);
-- 
GitLab