diff --git a/libtcc/kernel/TCCorrelator.cu b/libtcc/kernel/TCCorrelator.cu index fbb7469b9d95fc59e95580b198a8e404bb7c9774..52a1ef31169ffca3f4d2660a3feed1f673d0864c 100644 --- a/libtcc/kernel/TCCorrelator.cu +++ b/libtcc/kernel/TCCorrelator.cu @@ -265,8 +265,10 @@ __device__ inline float2 make_complex(float real, float imag) CUSTOM_STORE_VISIBILITY #else -template <bool add> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned polY, unsigned polX, Visibility visibility) +template <bool add> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned recvY, unsigned recvX, unsigned polY, unsigned polX, Visibility visibility) { + unsigned baseline = (recvX * (recvX + 1) / 2) + recvY; + if (add) visibilities[channel][baseline][polY][polX] += visibility; else @@ -276,10 +278,10 @@ template <bool add> __device__ inline void storeVisibility(Visibilities visibili #endif -template <bool add, typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) +template <bool add, typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned recvY, unsigned recvX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) { - if ((skipCheckY || recvY + tcY <= recvX + tcX) && (skipCheckX || recvX + tcX < NR_RECEIVERS)) - storeVisibility<add>(visibilities, channel, baseline + tcX * recvX + tcX * (tcX + 1) / 2 + tcY, polY, polX, make_complex(sumR, sumI)); + if ((skipCheckY || recvY <= recvX) && (skipCheckX || recvX < NR_RECEIVERS)) + storeVisibility<add>(visibilities, channel, recvY, recvX, polY, polX, make_complex(sumR, sumI)); } @@ -323,32 +325,28 @@ template <bool add>__device__ inline void storeVisibilities(Visibilities visibil #endif #else #if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) - unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); - unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); - unsigned polY = threadIdx.x & 1; - unsigned polX = (threadIdx.x >> 1) & 1; + unsigned polY = threadIdx.x & 1; + unsigned polX = (threadIdx.x >> 1) & 1; + unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); + unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); + + storeVisibility<add>(visibilities, channel, recvY + 0, recvX + 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); + storeVisibility<add>(visibilities, channel, recvY + 0, recvX + 1, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility<add>(visibilities, channel, recvY + 1, recvX + 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility<add>(visibilities, channel, recvY + 1, recvX + 1, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); #elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 || __CUDA_ARCH__ == 900 - unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); - unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); - unsigned polY = (threadIdx.x >> 2) & 1; - unsigned polX = threadIdx.x & 1; -#endif + unsigned polY = (threadIdx.x >> 2) & 1; + unsigned polX = threadIdx.x & 1; + unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); + unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); - unsigned baseline = (recvX * (recvX + 1) / 2) + recvY; - -#if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) - storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); - storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, 0, 1, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); - storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, 1, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); - storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, 1, 1, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); -#elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 || __CUDA_ARCH__ == 900 - storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); + storeVisibility<add>(visibilities, channel, recvY + 0, recvX + 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); #if NR_BITS == 8 || NR_BITS == 16 - storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, 0, 2, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); + storeVisibility<add>(visibilities, channel, recvY + 0, recvX + 2, polY, polX, skipCheckY, skipCheckX, sum.x[4], sum.x[5]); #endif - storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, 4, 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); + storeVisibility<add>(visibilities, channel, recvY + 4, recvX + 0, polY, polX, skipCheckY, skipCheckX, sum.x[2], sum.x[3]); #if NR_BITS == 8 || NR_BITS == 16 - storeVisibility<add>(visibilities, channel, baseline, recvY, recvX, 4, 2, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); + storeVisibility<add>(visibilities, channel, recvY + 4, recvX + 2, polY, polX, skipCheckY, skipCheckX, sum.x[6], sum.x[7]); #endif #endif #endif