Skip to content
Snippets Groups Projects
Commit f72b3bf2 authored by Leon Oostrum's avatar Leon Oostrum
Browse files

Implement direct write of visibilities from registers on AMD GPUs

parent d752471c
No related branches found
No related tags found
1 merge request!13Draft: AMD support
Pipeline #96819 passed
......@@ -25,6 +25,9 @@
#ifdef __HIP_PLATFORM_AMD__
#if defined(__HIP_DEVICE_COMPILE__) && (!defined(__GFX9__) && !defined(__GFX11__))
#error this architecture has no suitable tensor cores
#elif defined(__HIP_DEVICE_COMPILE__) && NR_BITS == 8 && (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942))
// CDNA3 does not support 16x16x16 instruction, only 16x16x32 and 32x32x16
#error 8-bit mode is not supported on CDNA3
#endif
#else
#if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 700 : 0)
......@@ -32,9 +35,15 @@
#endif
#endif
#ifdef __HIP_PLATFORM_AMD__
#if !defined(__GFX9__) && !defined(__GFX11__)
#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory)
#endif
#else
#if __CUDA_ARCH__ != 700 && __CUDA_ARCH__ != 720 && __CUDA_ARCH__ != 750 && __CUDA_ARCH__ != 800 && __CUDA_ARCH__ != 860 && __CUDA_ARCH__ != 870 && __CUDA_ARCH__ != 890 && __CUDA_ARCH__ != 900
#define PORTABLE // unknown architecture -> write visibilities in portable way (via shared memory)
#endif
#endif
#if NR_RECEIVERS_PER_BLOCK != 32 && NR_RECEIVERS_PER_BLOCK != 48 && NR_RECEIVERS_PER_BLOCK != 64
#error unsupported NR_RECEIVERS_PER_BLOCK
......@@ -294,13 +303,20 @@ CUSTOM_STORE_VISIBILITY
template <bool add> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned polY, unsigned polX, Visibility visibility)
{
if (add) {
if (add)
visibilities[channel][baseline][polY][polX] += visibility;
} else
else
visibilities[channel][baseline][polY][polX] = visibility;
}
template <bool add> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned polY, unsigned polX, unsigned c, Value value)
{
if (add)
reinterpret_cast<Value *>(&visibilities[channel][baseline][polY][polX])[c] += value;
else
reinterpret_cast<Value *>(&visibilities[channel][baseline][polY][polX])[c] = value;
}
#endif
......@@ -311,6 +327,13 @@ template <bool add, typename T> __device__ inline void storeVisibility(Visibilit
}
template <bool add, typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned polY, unsigned polX, unsigned c, bool skipCheckY, bool skipCheckX, T sum)
{
if ((skipCheckY || recvY <= recvX) && (skipCheckX || recvX < NR_RECEIVERS))
storeVisibility<add>(visibilities, channel, baseline, polY, polX, c, sum);
}
template <bool add>__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp)
{
#if defined PORTABLE
......@@ -351,6 +374,33 @@ template <bool add>__device__ inline void storeVisibilities(Visibilities visibil
visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX];
#endif
#else
#ifdef __HIP_PLATFORM_AMD__
#if NR_BITS == 8 || NR_BITS == 16
for (unsigned i = 0; i < sum.num_elements; i++) {
#if defined __GFX9__
unsigned row = 4 * (threadIdx.x / 16) + (i % 4);
#elif defined __GFX11__
unsigned row = (threadIdx.x / 16) + 2 * i;
#endif
unsigned col = threadIdx.x % 16;
unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + row / 2;
unsigned polY = row % 2;
unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + col / 4;
unsigned polX = (col / 2) % 2;
unsigned c = col % 2;
unsigned baseline = (recvX * (recvX + 1) / 2) + recvY;
storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, polY, polX, c, skipCheckY, skipCheckX, sum.x[i]);
// ToDo: Figure out why the results are wrong in some cases without this sync. e.g. CorrelatorTest -b 16 -c 1 -n 33 -N 32 -r 1 -R 1 -t 8
#if NR_BITS == 16
__syncthreads();
#endif
}
#endif
#else
#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16)
unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4);
unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2);
......@@ -381,6 +431,7 @@ template <bool add>__device__ inline void storeVisibilities(Visibilities visibil
#endif
#endif
#endif
#endif
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment