diff --git a/libtcc/TCCorrelator.cu b/libtcc/TCCorrelator.cu index 0e65d2cf5e47f6871facff0346d2c61568658b55..f046c9094d14b8d7761b2795d9a08a621c6ee9cc 100644 --- a/libtcc/TCCorrelator.cu +++ b/libtcc/TCCorrelator.cu @@ -8,14 +8,15 @@ #include <mma.h> -#define NR_BASELINES (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2) -#define ALIGN(A,N) (((A)+(N)-1)/(N)*(N)) +#define NR_BASELINES (NR_RECEIVERS * (NR_RECEIVERS + 1) / 2) +#define ALIGN(A,N) (((A)+(N)-1)/(N)*(N)) -#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) -#define NR_RECEIVERS_PER_TCM_X ((NR_BITS) == 4 ? 2 : 4) -#define NR_RECEIVERS_PER_TCM_Y 8 +#define NR_TIMES_PER_BLOCK (128 / (NR_BITS)) +#define NR_RECEIVERS_PER_TCM_X ((NR_BITS) == 4 ? 2 : 4) +#define NR_RECEIVERS_PER_TCM_Y 8 +#define NR_RECEIVERS_PER_BLOCK_X (NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK) -#define COMPLEX 2 +#define COMPLEX 2 #if __CUDA_ARCH__ < (NR_BITS == 4 ? 730 : NR_BITS == 8 ? 720 : NR_BITS == 16 ? 700 : 0) #error this architecture has no suitable tensor cores @@ -267,12 +268,12 @@ template <typename T> __device__ inline void storeVisibility(Visibilities visibi template <typename T> __device__ inline void storeVisibility(Visibilities visibilities, unsigned channel, unsigned baseline, unsigned recvY, unsigned recvX, unsigned tcY, unsigned tcX, unsigned polY, unsigned polX, bool skipCheckY, bool skipCheckX, T sumR, T sumI) { - if ((skipCheckX || recvX + tcX <= recvY + tcY) && (skipCheckY || recvY + tcY < NR_RECEIVERS)) - storeVisibility(visibilities, channel, baseline + tcY * recvY + tcY * (tcY + 1) / 2 + tcX, polY, polX, make_complex(sumR, sumI)); + if ((skipCheckY || recvY + tcY <= recvX + tcX) && (skipCheckX || recvX + tcX < NR_RECEIVERS)) + storeVisibility(visibilities, channel, baseline + tcX * recvX + tcX * (tcX + 1) / 2 + tcY, polY, polX, make_complex(sumR, sumI)); } -__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned recvYoffset, unsigned recvXoffset, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp) +__device__ inline void storeVisibilities(Visibilities visibilities, unsigned channel, unsigned firstReceiverY, unsigned firstReceiverX, unsigned y, unsigned x, bool skipCheckY, bool skipCheckX, const Sum &sum, ScratchSpace scratchSpace[], unsigned warp) { #if defined PORTABLE store_matrix_sync(&scratchSpace[warp][0][0][0][0].x, sum, NR_RECEIVERS_PER_TCM_X * NR_POLARIZATIONS * COMPLEX, mem_row_major); @@ -297,11 +298,11 @@ __device__ inline void storeVisibilities(Visibilities visibilities, unsigned cha unsigned _x = threadIdx.x & 3; #endif - unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + _y; - unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + _x; - unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + _y; + unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + _x; + unsigned baseline = (recvX * (recvX + 1) / 2) + recvY; - if ((skipCheckX || recvX <= recvY) && (skipCheckY || recvY < NR_RECEIVERS)) + if ((skipCheckY || recvY <= recvX) && (skipCheckX || recvX < NR_RECEIVERS)) #if NR_BITS == 4 for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) visibilities[channel][baseline][polY][polX] = scratchSpace[warp][_y][polY][_x][polX]; @@ -312,18 +313,18 @@ __device__ inline void storeVisibilities(Visibilities visibilities, unsigned cha #endif #else #if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) - unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); - unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); + unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 2) + (threadIdx.x & 4); + unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 2) & 2); unsigned polY = threadIdx.x & 1; unsigned polX = (threadIdx.x >> 1) & 1; #elif (__CUDA_ARCH__ == 720 && NR_BITS == 8) || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 800 || __CUDA_ARCH__ == 860 - unsigned recvY = firstReceiverY + recvYoffset + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); - unsigned recvX = firstReceiverX + recvXoffset + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); + unsigned recvY = firstReceiverY + NR_RECEIVERS_PER_TCM_Y * y + ((threadIdx.x >> 3) & 3); + unsigned recvX = firstReceiverX + NR_RECEIVERS_PER_TCM_X * x + ((threadIdx.x >> 1) & 1); unsigned polY = (threadIdx.x >> 2) & 1; unsigned polX = threadIdx.x & 1; #endif - unsigned baseline = (recvY * (recvY + 1) / 2) + recvX; + unsigned baseline = (recvX * (recvX + 1) / 2) + recvY; #if __CUDA_ARCH__ == 700 || (__CUDA_ARCH__ == 720 && NR_BITS == 16) storeVisibility(visibilities, channel, baseline, recvY, recvX, 0, 0, polY, polX, skipCheckY, skipCheckX, sum.x[0], sum.x[1]); @@ -361,9 +362,9 @@ template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities vi const uchar2 offsets[] = { make_uchar2( 0, 0), - make_uchar2( 0, 16), - make_uchar2( 0, 40), - make_uchar2(24, 40), + make_uchar2(16, 0), + make_uchar2(40, 0), + make_uchar2(40, 24), }; unsigned recvXoffset = offsets[warp].x; @@ -444,7 +445,7 @@ template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities vi for (unsigned y = 0; y < 2; y ++) { load_matrix_sync(aFrag, &bSamples[buffer][/*recvYoffset*/ 24 * z + NR_RECEIVERS_PER_TCM_Y * y][0][0][minorTime][0], sizeof(bSamples[0][0][0]) * 8 / NR_BITS); - for (unsigned x = 0; x < 8 * (y + 1) / NR_RECEIVERS_PER_TCM_X; x ++, i ++) + for (unsigned x = 8 * y / NR_RECEIVERS_PER_TCM_X; x < 16 / NR_RECEIVERS_PER_TCM_X; x ++, i ++) mma_sync(sum[i], aFrag, bFrag[x], sum[i]); } } @@ -459,21 +460,19 @@ template <bool fullTriangle> __device__ void doCorrelateTriangle(Visibilities vi if (warp != 0) for (unsigned y = 0, i = 0; y < nrFragmentsY; y ++) for (unsigned x = 0; x < nrFragmentsX; x ++, i ++) - storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, recvYoffset, recvXoffset, y, x, fullTriangle, y > 0 || x < (NR_BITS == 4 ? 8 : 4), sum[i], scratchSpace, warp); + storeVisibilities(visibilities, channel, firstReceiver + recvYoffset, firstReceiver + recvXoffset, y, x, y < 2 || x > (NR_BITS == 4 ? 4 : 2), fullTriangle, sum[i], scratchSpace, warp); else for (unsigned z = 0, i = 0; z < 3; z ++) for (unsigned y = 0; y < 2; y ++) - for (unsigned x = 0; x < (NR_BITS == 4 ? 4 : 2) * (y + 1); x ++, i ++) - storeVisibilities(visibilities, channel, firstReceiver, firstReceiver, 24 * z, 24 * z, y, x, fullTriangle, x < (NR_BITS == 4 ? 4 : 2) * y, sum[i], scratchSpace, warp); + for (unsigned x = 8 * y / NR_RECEIVERS_PER_TCM_X; x < 16 / NR_RECEIVERS_PER_TCM_X; x ++, i ++) + storeVisibilities(visibilities, channel, firstReceiver + 24 * z, firstReceiver + 24 * z, y, x, (y + 1) * NR_RECEIVERS_PER_TCM_Y <= x * NR_RECEIVERS_PER_TCM_X, fullTriangle, sum[i], scratchSpace, warp); } #endif -template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) +template <unsigned nrFragmentsY, unsigned nrFragmentsX, bool skipLoadYcheck, bool skipLoadXcheck, bool skipStoreYcheck, bool skipStoreXcheck> __device__ void doCorrelateRectangle(Visibilities visibilities, const Samples samples, unsigned firstReceiverY, unsigned firstReceiverX, SharedData<>::Asamples &aSamples, SharedData<NR_RECEIVERS_PER_BLOCK_X>::Bsamples &bSamples, ScratchSpace scratchSpace[NR_WARPS]) { - const unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_X / 2 / (NR_RECEIVERS_PER_BLOCK == 64 ? 2 : 1); - Sum sum[nrFragmentsY][nrFragmentsX]; for (unsigned y = 0; y < nrFragmentsY; y ++) @@ -611,7 +610,7 @@ template <unsigned nrFragmentsY, bool skipLoadYcheck, bool skipLoadXcheck, bool for (unsigned y = 0; y < nrFragmentsY; y ++) for (unsigned x = 0; x < nrFragmentsX; x ++) - storeVisibilities(visibilities, channel, firstReceiverY, firstReceiverX, recvYoffset, recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize); + storeVisibilities(visibilities, channel, firstReceiverY + recvYoffset, firstReceiverX + recvXoffset, y, x, skipStoreYcheck, skipStoreXcheck, sum[y][x], scratchSpace, tid / warpSize); } @@ -619,25 +618,31 @@ extern "C" __global__ __launch_bounds__(NR_WARPS * 32, NR_RECEIVERS_PER_BLOCK == 32 ? 4 : 2) void correlate(Visibilities visibilities, const Samples samples) { - const unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2; + constexpr unsigned nrFragmentsX = NR_RECEIVERS_PER_BLOCK_X / NR_RECEIVERS_PER_TCM_X / 2; + constexpr unsigned nrFragmentsY = NR_RECEIVERS_PER_BLOCK / NR_RECEIVERS_PER_TCM_Y / 2; unsigned block = blockIdx.x; #if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 - unsigned blockY = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2; - unsigned blockX = block - blockY * (blockY + 1) / 2; + unsigned blockX = (unsigned) (sqrtf(8 * block + 1) - .99999f) / 2; + unsigned blockY = block - blockX * (blockX + 1) / 2; + unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK; unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK; #elif NR_RECEIVERS_PER_BLOCK == 64 - unsigned blockY = (unsigned) sqrtf(block); - unsigned blockX = block - blockY * blockY; - unsigned firstReceiverX = blockX * (NR_RECEIVERS_PER_BLOCK / 2); + unsigned blockX = (unsigned) sqrtf(block); + unsigned blockY = block - blockX * blockX; + unsigned firstReceiverY = blockY / 2 * NR_RECEIVERS_PER_BLOCK; + //unsigned firstReceiverX = (2 * blockX + blockY % 2) * (NR_RECEIVERS_PER_BLOCK / 2); + unsigned firstReceiverX = blockX * NR_RECEIVERS_PER_BLOCK + (blockY % 2) * NR_RECEIVERS_PER_BLOCK / 2; + + if (firstReceiverX >= NR_RECEIVERS) + return; #endif - unsigned firstReceiverY = blockY * NR_RECEIVERS_PER_BLOCK; union shared { struct { SharedData<>::Asamples aSamples; - SharedData<NR_RECEIVERS_PER_BLOCK == 64 ? 32 : NR_RECEIVERS_PER_BLOCK>::Bsamples bSamples; + SharedData<NR_RECEIVERS_PER_BLOCK_X>::Bsamples bSamples; } rectangle; struct { SharedData<>::Bsamples samples; @@ -653,19 +658,19 @@ void correlate(Visibilities visibilities, const Samples samples) if (firstReceiverX == firstReceiverY) #if NR_RECEIVERS_PER_BLOCK == 32 || NR_RECEIVERS_PER_BLOCK == 48 - doCorrelateRectangle<nrFragmentsY, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); + doCorrelateRectangle<nrFragmentsY, nrFragmentsX, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0, false, NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK == 0>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); // TODO: smaller nrFragments[XY] #elif NR_RECEIVERS_PER_BLOCK == 64 if (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 && (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK)) doCorrelateTriangle<false>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); else doCorrelateTriangle<true>(visibilities, samples, firstReceiverX, 2 * threadIdx.z + threadIdx.y, 64 * threadIdx.z + 32 * threadIdx.y + threadIdx.x, u.triangle.samples, u.scratchSpace); #endif -#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK != 0 - else if (NR_RECEIVERS < NR_RECEIVERS_PER_BLOCK || firstReceiverY >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK * NR_RECEIVERS_PER_BLOCK) - doCorrelateRectangle<(NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK + 2 * NR_RECEIVERS_PER_TCM_Y - 1) / NR_RECEIVERS_PER_TCM_Y / 2, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_Y) == 0, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); +#if NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK_X != 0 + else if (firstReceiverX >= NR_RECEIVERS / NR_RECEIVERS_PER_BLOCK_X * NR_RECEIVERS_PER_BLOCK_X) + doCorrelateRectangle<nrFragmentsY, (NR_RECEIVERS % NR_RECEIVERS_PER_BLOCK_X + 2 * NR_RECEIVERS_PER_TCM_X - 1) / (2 * NR_RECEIVERS_PER_TCM_X), true, false, true, NR_RECEIVERS % (2 * NR_RECEIVERS_PER_TCM_X) == 0>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); #endif else - doCorrelateRectangle<nrFragmentsY, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); + doCorrelateRectangle<nrFragmentsY, nrFragmentsX, true, true, true, true>(visibilities, samples, firstReceiverY, firstReceiverX, u.rectangle.aSamples, u.rectangle.bSamples, u.scratchSpace); #if 0 if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { diff --git a/test/CorrelatorTest/CorrelatorTest.cc b/test/CorrelatorTest/CorrelatorTest.cc index 33b33d01381d53f22fbbaee48bce2f47b73d6211..728d049c638e4bf86e49a20aec8fdb247777c977 100644 --- a/test/CorrelatorTest/CorrelatorTest.cc +++ b/test/CorrelatorTest/CorrelatorTest.cc @@ -186,25 +186,25 @@ template<typename SampleType, typename VisibilityType> void CorrelatorTest::veri for (unsigned major_time = 0; major_time < options.nrSamplesPerChannel / options.nrTimesPerBlock; major_time ++) { multi_array::array_ref<SampleType, 3> ref = samples[channel][major_time]; - for (unsigned recv1 = 0, baseline = 0; recv1 < options.nrReceivers; recv1 ++) - for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++) + for (unsigned recvX = 0, baseline = 0; recvX < options.nrReceivers; recvX ++) + for (unsigned recvY = 0; recvY <= recvX; recvY ++, baseline ++) for (unsigned minor_time = 0; minor_time < options.nrTimesPerBlock; minor_time ++) - for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) - for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) { - SampleType sample0 = ref[recv0][pol0][minor_time]; - SampleType sample1 = ref[recv1][pol1][minor_time]; + for (unsigned polY = 0; polY < options.nrPolarizations; polY ++) + for (unsigned polX = 0; polX < options.nrPolarizations; polX ++) { + SampleType sampleY = ref[recvY][polY][minor_time]; + SampleType sampleX = ref[recvX][polX][minor_time]; - sum[baseline][pol1][pol0] += VisibilityType(sample1.real(), sample1.imag()) * conj(VisibilityType(sample0.real(), sample0.imag())); + sum[baseline][polY][polX] += VisibilityType(sampleY.real(), sampleY.imag()) * conj(VisibilityType(sampleX.real(), sampleX.imag())); } } for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++) - for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) - for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) - if (!approximates(visibilities[channel][baseline][pol1][pol0], sum[baseline][pol1][pol0]) && ++ count < 100) + for (unsigned polY = 0; polY < options.nrPolarizations; polY ++) + for (unsigned polX = 0; polX < options.nrPolarizations; polX ++) + if (!approximates(visibilities[channel][baseline][polY][polX], sum[baseline][polY][polX]) && ++ count < 100) #pragma omp critical (cout) ep([&] () { - std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl; + std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "], expected " << sum[baseline][polY][polX] << ", got " << visibilities[channel][baseline][polY][polX] << std::endl; }); }); @@ -212,11 +212,11 @@ template<typename SampleType, typename VisibilityType> void CorrelatorTest::veri #else for (unsigned channel = 0; channel < options.nrChannels; channel ++) for (unsigned baseline = 0; baseline < options.nrBaselines(); baseline ++) - for (unsigned pol0 = 0; pol0 < options.nrPolarizations; pol0 ++) - for (unsigned pol1 = 0; pol1 < options.nrPolarizations; pol1 ++) - if (visibilities[channel][baseline][pol0][pol1] != (VisibilityType)(0, 0)) + for (unsigned polY = 0; polY < options.nrPolarizations; polY ++) + for (unsigned polX = 0; polX < options.nrPolarizations; polX ++) + if (visibilities[channel][baseline][polY][polX] != (VisibilityType)(0, 0)) if (count ++ < 10) - std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl; + std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "] = " << visibilities[channel][baseline][polY][polX] << std::endl; #endif } diff --git a/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc index 9099af31f97d882e73947e7f8528feefa4fdeb78..c427c0e3a7d776465d9d0451c5e9b2c832d3a87e 100644 --- a/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc +++ b/test/OpenCLCorrelatorTest/OpenCLCorrelatorTest.cc @@ -22,6 +22,10 @@ #include <omp.h> #include <sys/types.h> +#if NR_BITS == 16 +#include <cuda_fp16.h> // we do not use code, only the __half type +#endif + #define __CL_ENABLE_EXCEPTIONS #include <CL/cl.hpp> #include <CL/cl_ext.h> @@ -348,32 +352,40 @@ void checkTestPattern(cl::CommandQueue &queue, cl::Buffer &visibilitiesBuffer, c memset(sum, 0, sizeof sum); for (unsigned major_time = 0; major_time < NR_SAMPLES_PER_CHANNEL / NR_TIMES_PER_BLOCK; major_time ++) - for (unsigned recv1 = 0, baseline = 0; recv1 < NR_RECEIVERS; recv1 ++) - for (unsigned recv0 = 0; recv0 <= recv1; recv0 ++, baseline ++) + for (unsigned recvX = 0, baseline = 0; recvX < NR_RECEIVERS; recvX ++) + for (unsigned recvY = 0; recvY <= recvX; recvY ++, baseline ++) for (unsigned minor_time = 0; minor_time < NR_TIMES_PER_BLOCK; minor_time ++) - for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) - for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) { + Sample sampleY = samples[channel][major_time][recvY][polY][minor_time]; + Sample sampleX = samples[channel][major_time][recvX][polX][minor_time]; + + sum[baseline][polY][polX] += Visibility(sampleY.real(), sampleY.imag()) * conj(Visibility(sampleX.real(), sampleX.imag())); +#if 0 #if NR_BITS == 4 || NR_BITS == 8 - sum[baseline][pol1][pol0] += (std::complex<int32_t>) samples[channel][major_time][recv1][pol1][minor_time] * conj((std::complex<int32_t>) samples[channel][major_time][recv0][pol0][minor_time]); + sum[baseline][polY][polX] += (std::complex<int32_t>) samples[channel][major_time][recvY][polY][minor_time] * conj((std::complex<int32_t>) samples[channel][major_time][recvX][polX][minor_time]); #elif NR_BITS == 16 - sum[baseline][pol1][pol0] += samples[channel][major_time][recv1][pol1][minor_time] * conj(samples[channel][major_time][recv0][pol0][minor_time]); + //sum[baseline][polY][polX] += (std::complex<float>) samples[channel][major_time][recvY][polY][minor_time] * conj((std::complex<float>) samples[channel][major_time][recvX][polX][minor_time]); + sum[baseline][polY][polX] += (std::complex<float>) samples[channel][major_time][recvY][polY][minor_time] * conj((std::complex<float>) samples[channel][major_time][recvX][polX][minor_time]); +#endif #endif + } for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++) - for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) - for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) #if NR_BITS == 4 || NR_BITS == 8 - if (visibilities[channel][baseline][pol1][pol0] != sum[baseline][pol1][pol0] && ++ count < 10) + if (visibilities[channel][baseline][polY][polX] != sum[baseline][polY][polX] && ++ count < 10) #pragma omp critical (cout) - std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << std::endl; + std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "], expected " << sum[baseline][polY][polX] << ", got " << visibilities[channel][baseline][polY][polX] << std::endl; #elif NR_BITS == 16 - float absolute = abs(visibilities[channel][baseline][pol1][pol0] - sum[baseline][pol1][pol0]); - float relative = abs(visibilities[channel][baseline][pol1][pol0] / sum[baseline][pol1][pol0]); - 195,1 83% - + { + float absolute = abs(visibilities[channel][baseline][polY][polX] - sum[baseline][polY][polX]); + float relative = abs(visibilities[channel][baseline][polY][polX] / sum[baseline][polY][polX]); - if ((relative < .999 || relative > 1.001) && absolute > .0001 * NR_SAMPLES_PER_CHANNEL && ++ count < 10) - std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol1 << "][" << pol0 << "], expected " << sum[baseline][pol1][pol0] << ", got " << visibilities[channel][baseline][pol1][pol0] << ", relative = " << relative << ", absolute = " << absolute << std::endl; + if ((relative < .999 || relative > 1.001) && absolute > .0001 * NR_SAMPLES_PER_CHANNEL && ++ count < 10) + std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "], expected " << sum[baseline][polY][polX] << ", got " << visibilities[channel][baseline][polY][polX] << ", relative = " << relative << ", absolute = " << absolute << std::endl; + } #endif delete ∑ } @@ -384,16 +396,16 @@ void checkTestPattern(cl::CommandQueue &queue, cl::Buffer &visibilitiesBuffer, c #else for (unsigned channel = 0; channel < NR_CHANNELS; channel ++) for (unsigned baseline = 0; baseline < NR_BASELINES; baseline ++) - for (unsigned pol0 = 0; pol0 < NR_POLARIZATIONS; pol0 ++) - for (unsigned pol1 = 0; pol1 < NR_POLARIZATIONS; pol1 ++) + for (unsigned polY = 0; polY < NR_POLARIZATIONS; polY ++) + for (unsigned polX = 0; polX < NR_POLARIZATIONS; polX ++) #if NR_BITS == 4 || NR_BITS == 8 - if (visibilities[channel][baseline][pol0][pol1] != std::complex<int32_t>(0, 0)) + if (visibilities[channel][baseline][polY][polX] != std::complex<int32_t>(0, 0)) #elif NR_BITS == 16 - if (visibilities[channel][baseline][pol0][pol1] != std::complex<float>(0.0f, 0.0f)) + if (visibilities[channel][baseline][polY][polX] != std::complex<float>(0.0f, 0.0f)) #endif if (count ++ < 10) #pragma omp critical (cout) - std::cout << "visibilities[" << channel << "][" << baseline << "][" << pol0 << "][" << pol1 << "] = " << visibilities[channel][baseline][pol0][pol1] << std::endl; + std::cout << "visibilities[" << channel << "][" << baseline << "][" << polY << "][" << polX << "] = " << visibilities[channel][baseline][polY][polX] << std::endl; #endif queue.enqueueUnmapMemObject(visibilitiesBuffer, visibilities); diff --git a/test/SimpleExample/SimpleExample.cu b/test/SimpleExample/SimpleExample.cu index 4fa52ddd281d7483c89031dac05dbe1b7a669783..ba862d35587c462431b81a10d61f097cd49c826b 100644 --- a/test/SimpleExample/SimpleExample.cu +++ b/test/SimpleExample/SimpleExample.cu @@ -66,7 +66,7 @@ int main() correlator.launchAsync((CUstream) stream, (CUdeviceptr) visibilities, (CUdeviceptr) samples); checkCudaCall(cudaDeviceSynchronize()); - std::cout << ((*visibilities)[160][87745][0][0] == Visibility(23, -2) ? "success" : "failed") << std::endl; + std::cout << ((*visibilities)[160][87745][0][0] == Visibility(23, 2) ? "success" : "failed") << std::endl; checkCudaCall(cudaFree(visibilities)); checkCudaCall(cudaFree(samples));