Skip to content
Snippets Groups Projects
Commit 7fec9606 authored by Jan David Mol's avatar Jan David Mol
Browse files

Task #4716: Sort devices based on PCI ID, to avoid CUDA messing up the order...

Task #4716: Sort devices based on PCI ID, to avoid CUDA messing up the order between MPI instances on the same node
parent ace336c7
No related branches found
No related tags found
No related merge requests found
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <algorithm> // for std::min #include <algorithm> // for std::min
#include <boost/noncopyable.hpp> #include <boost/noncopyable.hpp>
#include <boost/format.hpp>
#include <Common/Exception.h> #include <Common/Exception.h>
#include <Common/LofarLogger.h> #include <Common/LofarLogger.h>
...@@ -48,6 +49,8 @@ ...@@ -48,6 +49,8 @@
LOFAR::Exception::TerminateHandler th(LOFAR::Exception::terminate); LOFAR::Exception::TerminateHandler th(LOFAR::Exception::terminate);
using boost::format;
namespace LOFAR namespace LOFAR
{ {
namespace Cobalt namespace Cobalt
...@@ -244,6 +247,12 @@ namespace LOFAR ...@@ -244,6 +247,12 @@ namespace LOFAR
devices.push_back(Device(i)); devices.push_back(Device(i));
} }
// sort to get a predictable order,
// because CUDA derives its own sorting
// based on expected performance, which
// might differ per NUMA binding.
sort(devices.begin(), devices.end());
return devices; return devices;
} }
...@@ -258,6 +267,11 @@ namespace LOFAR ...@@ -258,6 +267,11 @@ namespace LOFAR
checkCuCall(cuDeviceGet(&_device, ordinal)); checkCuCall(cuDeviceGet(&_device, ordinal));
} }
bool Device::operator<(const Device &other) const
{
  // Devices are ordered by comparing their PCI ID strings, which gives
  // the lexicographic order of the text returned by pciId().
  const std::string ownId(pciId());
  const std::string otherId(other.pciId());

  return ownId < otherId;
}
std::string Device::getName() const std::string Device::getName() const
{ {
char name[1024]; char name[1024];
...@@ -314,6 +328,14 @@ namespace LOFAR ...@@ -314,6 +328,14 @@ namespace LOFAR
return (size_t)getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY); return (size_t)getAttribute(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY);
} }
std::string Device::pciId() const
{
int bus = getAttribute(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID);
int device = getAttribute(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID);
return str(format("%04x:%04x") % bus % device);
}
int Device::getAttribute(CUdevice_attribute attribute) const int Device::getAttribute(CUdevice_attribute attribute) const
{ {
int value; int value;
......
...@@ -117,6 +117,9 @@ namespace LOFAR ...@@ -117,6 +117,9 @@ namespace LOFAR
// valid range: [0, Platform.size()-1] // valid range: [0, Platform.size()-1]
Device(int ordinal = 0); Device(int ordinal = 0);
// Order Devices by PCI ID (used in std::sort).
// The comparison is done on the strings returned by pciId(), so the
// resulting order is the lexicographic order of that text.
// \param other the Device to compare against
// \return true if this Device's PCI ID string sorts before that of \a other
bool operator<(const Device &other) const;
// Return the name of the device in human readable form. // Return the name of the device in human readable form.
std::string getName() const; std::string getName() const;
...@@ -135,6 +138,9 @@ namespace LOFAR ...@@ -135,6 +138,9 @@ namespace LOFAR
// Return the total amount of constant memory // Return the total amount of constant memory
size_t getTotalConstMem() const; size_t getTotalConstMem() const;
// Return the PCI ID (bus:device) of this GPU as a string.
// The two parts are the CU_DEVICE_ATTRIBUTE_PCI_BUS_ID and
// CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID attributes, each formatted
// with "%04x" and joined by a colon.
std::string pciId() const;
// Return information on a specific \a attribute. // Return information on a specific \a attribute.
// \param attribute CUDA device attribute // \param attribute CUDA device attribute
int getAttribute(CUdevice_attribute attribute) const; int getAttribute(CUdevice_attribute attribute) const;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment