Merge branch 'optimize-inttofloat' into 'main'

IntToFloatKernel: increase parallelism

See merge request lofar2.0/cobalt!30

Merge branch 'optimize-inttofloat' into 'main'
96d2c871 · Jan David Mol · d4b8a83d · e3956695 · 96d2c871 · 96d2c871
Commit 96d2c871 authored 2 years ago by Jan David Mol
--- a/GPUProc/share/gpu/kernels/IntToFloat.cu
+++ b/GPUProc/share/gpu/kernels/IntToFloat.cu
@@ -93,6 +93,7 @@ __global__ void intToFloat(void *convertedDataPtr,
  uint station_in  = blockIdx.y;
  uint station_out = blockIdx.y;
 #endif
+  uint time_idx = threadIdx.x + blockIdx.x * blockDim.x;
 #ifdef DO_FFTSHIFT
  // Multiplication factor: 1 for even samples, -1 for odd samples
@@ -103,8 +104,12 @@ __global__ void intToFloat(void *convertedDataPtr,
 #endif
  // For even increases, we always process either even or odd samples
-  for (int time = threadIdx.x; time < NR_SAMPLES_PER_SUBBAND; time += blockDim.x)
+  for (int time = time_idx; time < NR_SAMPLES_PER_SUBBAND; time += blockDim.x * gridDim.x)
  {
+    if (time >= NR_SAMPLES_PER_SUBBAND) {
+      break;
+    }
    float4 sample;
    sample = make_float4(convertIntToFloat(REAL((*sampledData)[station_in][time][0])) * factor,

--- a/GPUProc/src/Kernels/IntToFloatKernel.cc
+++ b/GPUProc/src/Kernels/IntToFloatKernel.cc
@@ -112,7 +112,9 @@ namespace LOFAR
      stream.writeBuffer(stationIndices, stationIndicesHost, true);
      ASSERTSTR(maxThreadsPerBlock % 2 == 0, "IntToFloat.cu requires an even stepsize.");
-      setEnqueueWorkSizes( gpu::Grid(1, params.nrOutputStations()),
+      const gpu::Device device(_context.getDevice());
+      const int nrMPs = device.getAttribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
+      setEnqueueWorkSizes( gpu::Grid(nrMPs, params.nrOutputStations()),
                           gpu::Block(maxThreadsPerBlock) );
      unsigned nrSamples = params.nrOutputStations() * params.nrSamplesPerSubband * NR_POLARIZATIONS;