diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc index 6e217c691c35805472b27d7394797b2b12500b7f..f4ad933533cdb9c0cd11221228ed4d0466c91e64 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.cc @@ -58,7 +58,8 @@ namespace LOFAR const Parameters ¶ms) : gpu::Function(function), - maxThreadsPerBlock(stream.getContext().getDevice().getMaxThreadsPerBlock()), + maxThreadsPerBlock( + stream.getContext().getDevice().getMaxThreadsPerBlock()), itsStream(stream), itsBuffers(buffers), itsParameters(params) @@ -71,27 +72,37 @@ namespace LOFAR function.getAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS)); } - void Kernel::setEnqueueWorkSizes(gpu::Grid globalWorkSize, gpu::Block localWorkSize) + void Kernel::setEnqueueWorkSizes(gpu::Grid globalWorkSize, + gpu::Block localWorkSize) { gpu::Grid grid; ostringstream errMsgs; - // Enforce by the hardware supported work sizes to see errors clearly and early. + // Enforce by the hardware supported work sizes to see errors clearly and + // early. - gpu::Block maxLocalWorkSize = itsStream.getContext().getDevice().getMaxBlockDims(); + gpu::Block maxLocalWorkSize = + itsStream.getContext().getDevice().getMaxBlockDims(); if (localWorkSize.x > maxLocalWorkSize.x || localWorkSize.y > maxLocalWorkSize.y || localWorkSize.z > maxLocalWorkSize.z) - errMsgs << " - localWorkSize must be at most " << maxLocalWorkSize << endl; - - if (localWorkSize.x * localWorkSize.y * localWorkSize.z > maxThreadsPerBlock) - errMsgs << " - localWorkSize total must be at most " << maxThreadsPerBlock << " threads/block" << endl; - - // globalWorkSize may (in theory) be all zero (no work). Reject such localWorkSize. - if (localWorkSize.x == 0 || localWorkSize.y == 0 || localWorkSize.z == 0) { + errMsgs << " - localWorkSize must be at most " << maxLocalWorkSize + << endl; + + if (localWorkSize.x * localWorkSize.y * localWorkSize.z > + maxThreadsPerBlock) + errMsgs << " - localWorkSize total must be at most " + << maxThreadsPerBlock << " threads/block" << endl; + + // globalWorkSize may (in theory) be all zero (no work). Reject such + // localWorkSize. + if (localWorkSize.x == 0 || + localWorkSize.y == 0 || + localWorkSize.z == 0) { errMsgs << " - localWorkSize components must be non-zero" << endl; } else { - // TODO: to globalWorkSize in terms of localWorkSize (CUDA) ('gridWorkSize'). + // TODO: to globalWorkSize in terms of localWorkSize (CUDA) + // ('gridWorkSize'). if (globalWorkSize.x % localWorkSize.x != 0 || globalWorkSize.y % localWorkSize.y != 0 || globalWorkSize.z % localWorkSize.z != 0) @@ -100,19 +111,21 @@ namespace LOFAR globalWorkSize.y / localWorkSize.y, globalWorkSize.z / localWorkSize.z); - gpu::Grid maxGridWorkSize = itsStream.getContext().getDevice().getMaxGridDims(); + gpu::Grid maxGridWorkSize = + itsStream.getContext().getDevice().getMaxGridDims(); if (grid.x > maxGridWorkSize.x || grid.y > maxGridWorkSize.y || grid.z > maxGridWorkSize.z) - errMsgs << " - globalWorkSize / localWorkSize must be at most " << maxGridWorkSize << endl; + errMsgs << " - globalWorkSize / localWorkSize must be at most " + << maxGridWorkSize << endl; } - string errStr(errMsgs.str()); if (!errStr.empty()) - THROW(gpu::GPUException, "setEnqueueWorkSizes(): unsupported globalWorkSize " << - globalWorkSize << " and/or localWorkSize " << localWorkSize << " selected:" << - endl << errStr); + THROW(gpu::GPUException, + "setEnqueueWorkSizes(): unsupported globalWorkSize " << + globalWorkSize << " and/or localWorkSize " << localWorkSize << + " selected:" << endl << errStr); LOG_DEBUG_STR("CUDA Grid size: " << grid); LOG_DEBUG_STR("CUDA Block size: " << localWorkSize); diff --git a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h index 701842f84bbe0deb33b3fb683f25597ee8ea5951..591f8a7f333e1f6d98373ead29a5523a688e2e17 100644 --- a/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h +++ b/RTCP/Cobalt/GPUProc/src/cuda/Kernels/Kernel.h @@ -41,7 +41,9 @@ namespace LOFAR { public: // Parameters that must be passed to the constructor of this Kernel class. - // TODO: more at constructor passed immediates can be turned into defines (blockDim/gridDim too if enforced fixed (consider conditional define) or drop opt) + // TODO: more at constructor passed immediates can be turned into defines + // (blockDim/gridDim too if enforced fixed (consider conditional define) + // or drop opt) struct Parameters { Parameters(const Parset& ps); @@ -85,7 +87,8 @@ namespace LOFAR // Explicit destructor, because the implicitly generated one is public. ~Kernel(); - void setEnqueueWorkSizes(gpu::Grid globalWorkSize, gpu::Block localWorkSize); + void setEnqueueWorkSizes(gpu::Grid globalWorkSize, + gpu::Block localWorkSize); const unsigned maxThreadsPerBlock;