Select Git revision
CudaRuntimeCompiler.cc
Alexander van Amesfoort authored
- Adapt to Jan David's changes to gpu_wrapper, mainly context deps - Move context storage point to WorkQueue. But the workqueue cannot create the context, as Pipeline needs it first to build kernels. - Copy filterbank weights to pinned CPU memory. Not really for perf, but to make the transfer compile. I hope I didn't break it. - Kernels are still compiled in Pipeline, but moved from doWork() to CorrelatorPipeline(). Build dupl can now be optimized out easier. - Put original default defines in CudaRuntimeCompiler, namespace to LOFAR::Cobalt (flags_type and definitions_type a bit generic now). - Seems nvcc doesn't accept my -I (?), but I haven't been trying hard. This has to be extended wrt testing vs installed location. - Store workqueue objects in Pipeline. No longer store CorrelatePipeline programs in Pipeline, in CorrelatorPipeline() seems enough. - Convert data copying to our functions. Buffer flags still have some cl taste, but much less relevant in CUDA. Fix up another time. - Reduce data size of tStreamReadBuffer, tFFT for my laptop GPU. - Move USE_B7015 set_affinity() out of the way to right after OpenMP task creation. - Misc: fix some typos, add missing iomanip include, improve some comments, add TODOs. Only a couple of changes went into OpenCL, 90+% did not. Not all changes applied to bf and UHEP files, because they were already different. Unfort, no tests yet, though existing tests are ok. After, still significant refactoring can (should) be done in Buffers.h. Maybe not in this sprint.
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
CudaRuntimeCompiler.cc 7.06 KiB
//# CudaRuntimeCompiler.cc
//# Copyright (C) 2013 ASTRON (Netherlands Institute for Radio Astronomy)
//# P.O. Box 2, 7990 AA Dwingeloo, The Netherlands
//#
//# This file is part of the LOFAR software suite.
//# The LOFAR software suite is free software: you can redistribute it and/or
//# modify it under the terms of the GNU General Public License as published
//# by the Free Software Foundation, either version 3 of the License, or
//# (at your option) any later version.
//#
//# The LOFAR software suite is distributed in the hope that it will be useful,
//# but WITHOUT ANY WARRANTY; without even the implied warranty of
//# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//# GNU General Public License for more details.
//#
//# You should have received a copy of the GNU General Public License along
//# with the LOFAR software suite. If not, see <http://www.gnu.org/licenses/>.
//#
//# $Id$
#include <lofar_config.h>
#include "CudaRuntimeCompiler.h"
#include <cstdio> // popen, pget
#include <stdexcept>
#include <iostream>
#include <string>
#include <sstream>
#include <map>
#include <set>
#include <boost/format.hpp>
#include <Common/Exception.h>
#include <Common/SystemCallException.h>
#include <Common/LofarLogger.h>
#include <Common/SystemUtil.h>
#include <GPUProc/global_defines.h>
using namespace std;
// Collection of functions needed for runtime compilation of a kernel supplied
// as a path to a ptx string.
namespace LOFAR
{
namespace Cobalt
{
// flags
typedef std::set<std::string> flags_type;
// defines
typedef std::map<std::string, std::string> definitions_type;
// Return the set of default flags for the nvcc compilation of a cuda kernel in Cobalt
flags_type defaultFlags()
{
flags_type flags;
using boost::format;
//flags.insert("device-debug");
flags.insert("use_fast_math");
// gpu-architecture and -Ipath are set by createPTX()
return flags;
};
// Return empty set of definitions for the nvcc compilation of a cuda kernel
definitions_type defaultDefinitions()
{
definitions_type defs;
return defs;
}
// Return the set of default definitions for the nvcc compilation of a cuda kernel in Cobalt
definitions_type defaultDefinitions(const Parset &ps)
{
definitions_type defs;
using boost::format;
defs["NVIDIA_CUDA"] = ""; // left-over from OpenCL for Correlator.cl/.cu
// TODO: support device specific defs somehow (createPTX() knows about targets, but may be kernel and target specific)
//if (devices[0].getInfo<CL_DEVICE_NAME>() == "GeForce GTX 680")
// defs["USE_FLOAT4_IN_CORRELATOR"] = "";
// TODO: kernel-specific defs should be specified in the XXXKernel class
defs["NR_BITS_PER_SAMPLE"] = str(format("%u") % ps.nrBitsPerSample());
defs["SUBBAND_BANDWIDTH"] = str(format("%.7ff") % ps.subbandBandwidth()); // returns double, so rounding issue?
defs["NR_SUBBANDS"] = str(format("%u") % ps.nrSubbands()); // size_t, but %zu not supp
defs["NR_CHANNELS"] = str(format("%u") % ps.nrChannelsPerSubband());
defs["NR_STATIONS"] = str(format("%u") % ps.nrStations());
defs["NR_SAMPLES_PER_CHANNEL"] = str(format("%u") % ps.nrSamplesPerChannel());
defs["NR_SAMPLES_PER_SUBBAND"] = str(format("%u") % ps.nrSamplesPerSubband());
defs["NR_BEAMS"] = str(format("%u") % ps.nrBeams());
defs["NR_TABS"] = str(format("%u") % ps.nrTABs(0)); // TODO: 0 should be dep on #beams
defs["NR_COHERENT_STOKES"] = str(format("%u") % ps.nrCoherentStokes()); // size_t
defs["NR_INCOHERENT_STOKES"] = str(format("%u") % ps.nrIncoherentStokes()); // size_t
defs["COHERENT_STOKES_TIME_INTEGRATION_FACTOR"] = str(format("%u") % ps.coherentStokesTimeIntegrationFactor());
defs["INCOHERENT_STOKES_TIME_INTEGRATION_FACTOR"] = str(format("%u") % ps.incoherentStokesTimeIntegrationFactor());
defs["NR_POLARIZATIONS"] = str(format("%u") % NR_POLARIZATIONS);
defs["NR_TAPS"] = str(format("%u") % NR_TAPS);
defs["NR_STATION_FILTER_TAPS"] = str(format("%u") % NR_STATION_FILTER_TAPS);
if (ps.delayCompensation())
defs["DELAY_COMPENSATION"] = "";
if (ps.correctBandPass())
defs["BANDPASS_CORRECTION"] = "";
defs["DEDISPERSION_FFT_SIZE"] = str(format("%u") % ps.dedispersionFFTsize()); // size_t
return defs;
}
// Performs a 'system' call of nvcc. Return the stdout of the command
// on error no stdout is created and an exception is thrown
std::string runNVCC(const std::string &cmd)
{
// *******************************************************
// Call nvcc on the command line, get content as string
stringstream ptxFileContent; // The ptx file, to be read from stdout
char buffer [1024];
FILE * ptxFilePointer = popen(cmd.c_str(), "r" );
if (!ptxFilePointer)
throw SystemCallException("popen", errno, THROW_ARGS);
// Read the content of the file pointer
while ( ! feof (ptxFilePointer) ) //We do not get the cerr
{
if (fgets(buffer, sizeof buffer, ptxFilePointer) == NULL) // FILE * can only be read with cstdio functions
break;
ptxFileContent << buffer;
}
// Close file stream
if (fclose(ptxFilePointer) == EOF)
throw SystemCallException("fclose", errno, THROW_ARGS);
// *******************************************************
// Fetch and return PTX, if any
string ptxStr = ptxFileContent.str();
if (ptxStr.empty())
{
LOG_DEBUG(" NVCC Compilation of cuda kernel failed. Command line arguments:");
LOG_DEBUG(cmd);
// log that we have a failed compile run
THROW(Exception, "nvcc compilation failed!");
}
return ptxStr;
}
// Create a nvcc command line string based on the input path, a set of flags and a map
// of definitions. Use this command to call nvcc and compile the file at input path to a ptx file
// which content is returned as a string
std::string compileToPtx(const std::string& pathToCuFile, const flags_type& flags, const definitions_type& definitions)
{
const string cudaCompiler = "nvcc";
stringstream cmd("");
cmd << cudaCompiler ;
cmd << " " << pathToCuFile ;
cmd << " --ptx";
// add the set of flags
for (flags_type::const_iterator it=flags.begin(); it!=flags.end(); ++it)
cmd << " --" << *it; // flags should be prepended with a space and a minus
// add the map of defines
for (definitions_type::const_iterator it=definitions.begin(); it!=definitions.end(); ++it)
{
cmd << " -D" << it->first;
if (!it->second.empty())
cmd << "=" << it->second; // e.g. -DTEST=20
}
// output to stdout
cmd << " -o -";
LOG_INFO_STR("Runtime compilation of kernel, command: " << cmd.str());
return runNVCC(cmd.str());
};
// overloaded function. Use the path and default flags and definitions to call nvcc
std::string compileToPtx(const std::string& pathToCuFile)
{
// compile with the default flags and definitions
return compileToPtx(pathToCuFile, defaultFlags(), defaultDefinitions());
};
}
}