Skip to content
Snippets Groups Projects
Commit 96d2c871 authored by Jan David Mol's avatar Jan David Mol
Browse files

Merge branch 'optimize-inttofloat' into 'main'

IntToFloatKernel: increase parallelism

See merge request lofar2.0/cobalt!30
parents d4b8a83d e3956695
No related branches found
No related tags found
1 merge request!30IntToFloatKernel: increase parallelism
...@@ -93,6 +93,7 @@ __global__ void intToFloat(void *convertedDataPtr, ...@@ -93,6 +93,7 @@ __global__ void intToFloat(void *convertedDataPtr,
uint station_in = blockIdx.y; uint station_in = blockIdx.y;
uint station_out = blockIdx.y; uint station_out = blockIdx.y;
#endif #endif
uint time_idx = threadIdx.x + blockIdx.x * blockDim.x;
#ifdef DO_FFTSHIFT #ifdef DO_FFTSHIFT
// Multiplication factor: 1 for even samples, -1 for odd samples // Multiplication factor: 1 for even samples, -1 for odd samples
...@@ -103,8 +104,12 @@ __global__ void intToFloat(void *convertedDataPtr, ...@@ -103,8 +104,12 @@ __global__ void intToFloat(void *convertedDataPtr,
#endif #endif
// For even increases, we always process either even or odd samples // For even increases, we always process either even or odd samples
for (int time = threadIdx.x; time < NR_SAMPLES_PER_SUBBAND; time += blockDim.x) for (int time = time_idx; time < NR_SAMPLES_PER_SUBBAND; time += blockDim.x * gridDim.x)
{ {
if (time >= NR_SAMPLES_PER_SUBBAND) {
break;
}
float4 sample; float4 sample;
sample = make_float4(convertIntToFloat(REAL((*sampledData)[station_in][time][0])) * factor, sample = make_float4(convertIntToFloat(REAL((*sampledData)[station_in][time][0])) * factor,
......
...@@ -112,7 +112,9 @@ namespace LOFAR ...@@ -112,7 +112,9 @@ namespace LOFAR
stream.writeBuffer(stationIndices, stationIndicesHost, true); stream.writeBuffer(stationIndices, stationIndicesHost, true);
ASSERTSTR(maxThreadsPerBlock % 2 == 0, "IntToFloat.cu requires an even stepsize."); ASSERTSTR(maxThreadsPerBlock % 2 == 0, "IntToFloat.cu requires an even stepsize.");
setEnqueueWorkSizes( gpu::Grid(1, params.nrOutputStations()), const gpu::Device device(_context.getDevice());
const int nrMPs = device.getAttribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
setEnqueueWorkSizes( gpu::Grid(nrMPs, params.nrOutputStations()),
gpu::Block(maxThreadsPerBlock) ); gpu::Block(maxThreadsPerBlock) );
unsigned nrSamples = params.nrOutputStations() * params.nrSamplesPerSubband * NR_POLARIZATIONS; unsigned nrSamples = params.nrOutputStations() * params.nrSamplesPerSubband * NR_POLARIZATIONS;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment