Skip to content
Snippets Groups Projects
Commit 7fec9606 authored by Jan David Mol's avatar Jan David Mol
Browse files

Task #4716: Sort devices based on PCI ID, to avoid CUDA messing up the order...

Task #4716: Sort devices based on PCI ID, to avoid CUDA messing up the order between MPI instances on the same node
parent ace336c7
No related branches found
No related tags found
No related merge requests found
......@@ -27,6 +27,7 @@
#include <algorithm> // for std::min
#include <boost/noncopyable.hpp>
#include <boost/format.hpp>
#include <Common/Exception.h>
#include <Common/LofarLogger.h>
......@@ -48,6 +49,8 @@
LOFAR::Exception::TerminateHandler th(LOFAR::Exception::terminate);
using boost::format;
namespace LOFAR
{
namespace Cobalt
......@@ -244,6 +247,12 @@ namespace LOFAR
devices.push_back(Device(i));
}
// sort to get a predictable order,
// because CUDA derives its own sorting
// based on expected performance, which
// might differ per NUMA binding.
sort(devices.begin(), devices.end());
return devices;
}
......@@ -258,6 +267,11 @@ namespace LOFAR
checkCuCall(cuDeviceGet(&_device, ordinal));
}
// Order Devices by their PCI location, so that sorting a list of Devices
// yields the same order regardless of how CUDA enumerated them.
//
// The comparison is lexicographic on (PCI domain, PCI bus, PCI device).
// The domain is included because the bus:device pair alone is only unique
// within one PCI domain; without it, GPUs in different domains could compare
// equal and end up in an unspecified relative order after std::sort.
//
// \param other the Device to compare against
// \return true if this Device sorts strictly before \a other
bool Device::operator<(const Device &other) const
{
  // Most significant key first
  const CUdevice_attribute keys[] = {
    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID,
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID
  };
  const size_t nrKeys = sizeof keys / sizeof keys[0];

  for (size_t i = 0; i < nrKeys; ++i) {
    const int mine = getAttribute(keys[i]);
    const int theirs = other.getAttribute(keys[i]);

    // The first differing key decides the order
    if (mine != theirs)
      return mine < theirs;
  }

  // Identical PCI location: neither sorts before the other
  return false;
}
std::string Device::getName() const
{
char name[1024];
......@@ -314,6 +328,14 @@ namespace LOFAR
return (size_t)getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY);
}
std::string Device::pciId() const
{
int bus = getAttribute(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID);
int device = getAttribute(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID);
return str(format("%04x:%04x") % bus % device);
}
int Device::getAttribute(CUdevice_attribute attribute) const
{
int value;
......
......@@ -117,6 +117,9 @@ namespace LOFAR
// valid range: [0, Platform.size()-1]
Device(int ordinal = 0);
// Order Devices by their PCI ID. Used by std::sort to put the list of
// devices in a predictable order that does not depend on the order in
// which CUDA enumerates them.
// \param other the Device to compare against
// \return true if this Device sorts before \a other
bool operator<(const Device &other) const;
// Return the name of the device in human readable form.
std::string getName() const;
......@@ -135,6 +138,9 @@ namespace LOFAR
// Return the total amount of constant memory
size_t getTotalConstMem() const;
// Return the PCI ID of this GPU as a string of the form "bus:device",
// with both numbers in lowercase hexadecimal, zero-padded to at least
// 4 digits each (e.g. bus 2, device 0 yields "0002:0000").
std::string pciId() const;
// Return information on a specific \a attribute.
// \param attribute CUDA device attribute
int getAttribute(CUdevice_attribute attribute) const;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment