diff --git a/.gitattributes b/.gitattributes
index d4fda8e07331c0953a0f686f6d095bdd186809e8..a2d1a346b15364f896154ceb160f1028abf4f572 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -4279,26 +4279,6 @@ RTCP/Cobalt/InputProc/test/tPacketReader.in_16bit -text
 RTCP/Cobalt/InputProc/test/tPacketReader.in_8bit -text
 RTCP/Cobalt/InputProc/test/tPacketReader.sh eol=lf
 RTCP/Cobalt/InputProc/test/t_generateRSP.sh eol=lf
-RTCP/Cobalt/OpenCL_FFT/doc/package.dox -text
-RTCP/Cobalt/OpenCL_FFT/src/AccelerateError.pdf -text
-RTCP/Cobalt/OpenCL_FFT/src/CMakeLists.txt -text
-RTCP/Cobalt/OpenCL_FFT/src/Error.pdf -text
-RTCP/Cobalt/OpenCL_FFT/src/Makefile -text
-RTCP/Cobalt/OpenCL_FFT/src/Makefile.not -text
-RTCP/Cobalt/OpenCL_FFT/src/OpenCLError.pdf -text svneol=unset#unset
-RTCP/Cobalt/OpenCL_FFT/src/OpenCL_FFT.xcodeproj/project.pbxproj -text
-RTCP/Cobalt/OpenCL_FFT/src/ReadMe.txt -text
-RTCP/Cobalt/OpenCL_FFT/src/clFFT.h -text
-RTCP/Cobalt/OpenCL_FFT/src/fft_base_kernels.h -text
-RTCP/Cobalt/OpenCL_FFT/src/fft_execute.cpp -text
-RTCP/Cobalt/OpenCL_FFT/src/fft_internal.h -text
-RTCP/Cobalt/OpenCL_FFT/src/fft_kernelstring.cpp -text
-RTCP/Cobalt/OpenCL_FFT/src/fft_kernelstring.cpp.orig -text
-RTCP/Cobalt/OpenCL_FFT/src/fft_setup.cpp -text
-RTCP/Cobalt/OpenCL_FFT/src/libOpenCL_FFT.a.not -text
-RTCP/Cobalt/OpenCL_FFT/src/main.cpp -text
-RTCP/Cobalt/OpenCL_FFT/src/param.txt -text
-RTCP/Cobalt/OpenCL_FFT/src/procs.h -text
 RTCP/Cobalt/OutputProc/doc/package.dox -text
 RTCP/Cobalt/OutputProc/etc/sudoers.d/setcap_cobalt -text
 RTCP/Cobalt/OutputProc/scripts/bf-output-loss.sh eol=lf
diff --git a/RTCP/Cobalt/OpenCL_FFT/CMakeLists.txt b/RTCP/Cobalt/OpenCL_FFT/CMakeLists.txt
deleted file mode 100644
index 7cce981f91c6fc4d9350e1f109a7a7f5fbe7fd26..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# $Id$
-
-lofar_package(OpenCL_FFT 1.0)
-
-if(USE_OPENCL)
-  include(LofarFindPackage)
-  lofar_find_package(OpenCL REQUIRED)
-  add_subdirectory(src)
-endif()
diff --git a/RTCP/Cobalt/OpenCL_FFT/doc/package.dox b/RTCP/Cobalt/OpenCL_FFT/doc/package.dox
deleted file mode 100644
index dceac79a6589c784f0c4ef6a2127c9dbdf81651a..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/doc/package.dox
+++ /dev/null
@@ -1,9 +0,0 @@
-/**
-
-\ingroup Cobalt
-\defgroup OpenCL_FFT OpenCL FFT
-
-Apple's FFT library for OpenCL
-
-*/
-
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/AccelerateError.pdf b/RTCP/Cobalt/OpenCL_FFT/src/AccelerateError.pdf
deleted file mode 100644
index 55cd832de7c8b8a1e4ce0e1a640c0d659480b69e..0000000000000000000000000000000000000000
Binary files a/RTCP/Cobalt/OpenCL_FFT/src/AccelerateError.pdf and /dev/null differ
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/CMakeLists.txt b/RTCP/Cobalt/OpenCL_FFT/src/CMakeLists.txt
deleted file mode 100644
index e683168d04b311f7b206f15c7042a41464ab00f0..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-# $Id: CMakeLists.txt 17003 2011-01-06 08:54:59Z romein $
-
-include(LofarPackageVersion)
-
-# Create symbolic link to include directory.
-execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  ${CMAKE_BINARY_DIR}/include/${PACKAGE_NAME})
-
-set(OpenCL_FFT_LIB_SRCS
-  #Package__Version.cc
-  fft_execute.cpp
-  fft_kernelstring.cpp
-  fft_setup.cpp)
-
-lofar_add_library(opencl_fft ${OpenCL_FFT_LIB_SRCS})
-
-#lofar_add_bin_program(versionopencl_fft versionopencl_fft.cc) #FIXME
-
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/Error.pdf b/RTCP/Cobalt/OpenCL_FFT/src/Error.pdf
deleted file mode 100644
index 1d252a9c1a0acdfac432caa11e11d705e9d8acc0..0000000000000000000000000000000000000000
Binary files a/RTCP/Cobalt/OpenCL_FFT/src/Error.pdf and /dev/null differ
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/Makefile b/RTCP/Cobalt/OpenCL_FFT/src/Makefile
deleted file mode 100644
index 3a25689ec2b15f91e630c4d731748f42f5dc41c2..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/Makefile
+++ /dev/null
@@ -1,19 +0,0 @@
-SRCS = fft_execute.cpp fft_setup.cpp fft_kernelstring.cpp
-HEADERS = procs.h fft_internal.h fft_base_kernels.h clFFT.h
-LIBRARY = libOpenCL_FFT.a
-COMPILERFLAGS = -c -g -Wall -Werror -O3 -I..
-CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF}
-CC = g++
-
-OBJECTS = fft_execute.o fft_setup.o fft_kernelstring.o
-TARGETOBJECT =
-all: $(LIBRARY)
-
-$(OBJECTS): $(SRCS) $(HEADERS)
-	$(CC) $(CFLAGS) $(SRCS)
-
-$(LIBRARY): $(OBJECTS)
-	ar r $@ $^
-
-clean:
-	rm -f $(TARGET) $(OBJECTS)
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/Makefile.not b/RTCP/Cobalt/OpenCL_FFT/src/Makefile.not
deleted file mode 100644
index 1c65cd84a7c59a494d0891ee9ac7443c1dbcc463..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/Makefile.not
+++ /dev/null
@@ -1,28 +0,0 @@
-ifdef BUILD_WITH_ATF
-ATF = -framework ATF
-USE_ATF = -DUSE_ATF
-endif
-
-SRCS = fft_execute.cpp fft_setup.cpp main.cpp fft_kernelstring.cpp
-HEADERS = procs.h fft_internal.h fft_base_kernels.h clFFT.h
-TARGET = test_clFFT
-COMPILERFLAGS = -c -g -Wall -Werror -O3
-CFLAGS = $(COMPILERFLAGS) ${RC_CFLAGS} ${USE_ATF}
-CC = g++
-LIBRARIES = -framework OpenCL -framework Accelerate -framework AppKit ${RC_CFLAGS} ${ATF}
-
-OBJECTS = fft_execute.o fft_setup.o main.o fft_kernelstring.o
-TARGETOBJECT =
-all: $(TARGET)
-
-$(OBJECTS): $(SRCS) $(HEADERS)
-	$(CC) $(CFLAGS) $(SRCS)
-
-$(TARGET): $(OBJECTS)
-	$(CC) $(OBJECTS) -o $@ $(LIBRARIES)
-
-clean:
-	rm -f $(TARGET) $(OBJECTS)
-
-.DEFAULT:
-	@echo The target \"$@\" does not exist in Makefile.
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/OpenCLError.pdf b/RTCP/Cobalt/OpenCL_FFT/src/OpenCLError.pdf
deleted file mode 100644
index c42309ff5196000bf9c83f5a7347564312db5a94..0000000000000000000000000000000000000000
Binary files a/RTCP/Cobalt/OpenCL_FFT/src/OpenCLError.pdf and /dev/null differ
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/OpenCL_FFT.xcodeproj/project.pbxproj b/RTCP/Cobalt/OpenCL_FFT/src/OpenCL_FFT.xcodeproj/project.pbxproj
deleted file mode 100644
index 5474161f048eb2b61197875e1ff642b9289aad9e..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/OpenCL_FFT.xcodeproj/project.pbxproj
+++ /dev/null
@@ -1,237 +0,0 @@
-// !$*UTF8*$!
-{
-	archiveVersion = 1;
-	classes = {
-	};
-	objectVersion = 45;
-	objects = {
-
-/* Begin PBXBuildFile section */
-		BE94A7B3108AB33000C1AD87 /* fft_kernelstring.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BE94A7B2108AB33000C1AD87 /* fft_kernelstring.cpp */; };
-		BE94A83D108AF8A100C1AD87 /* fft_setup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BE94A83C108AF8A100C1AD87 /* fft_setup.cpp */; };
-		BEE709AF1097B8DD0017B8A5 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BEE709AE1097B8DD0017B8A5 /* main.cpp */; };
-		BEEA39EE108BD89D00729F49 /* fft_execute.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BEEA39ED108BD89D00729F49 /* fft_execute.cpp */; };
-/* End PBXBuildFile section */
-
-/* Begin PBXCopyFilesBuildPhase section */
-		8DD76F690486A84900D96B5E /* CopyFiles */ = {
-			isa = PBXCopyFilesBuildPhase;
-			buildActionMask = 8;
-			dstPath = /usr/share/man/man1/;
-			dstSubfolderSpec = 0;
-			files = (
-			);
-			runOnlyForDeploymentPostprocessing = 1;
-		};
-/* End PBXCopyFilesBuildPhase section */
-
-/* Begin PBXFileReference section */
-		8DD76F6C0486A84900D96B5E /* OpenCL_FFT */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = OpenCL_FFT; sourceTree = BUILT_PRODUCTS_DIR; };
-		BE94A7B2108AB33000C1AD87 /* fft_kernelstring.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fft_kernelstring.cpp; sourceTree = "<group>"; };
-		BE94A7CB108AB8BF00C1AD87 /* clFFT.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = clFFT.h; sourceTree = "<group>"; };
-		BE94A7D4108ABFF000C1AD87 /* fft_internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fft_internal.h; sourceTree = "<group>"; };
-		BE94A83C108AF8A100C1AD87 /* fft_setup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fft_setup.cpp; sourceTree = "<group>"; };
-		BE9DE3E010923A4E00940D66 /* fft_base_kernels.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fft_base_kernels.h; sourceTree = "<group>"; };
-		BE9DE4741092732C00940D66 /* param.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = param.txt; sourceTree = "<group>"; };
-		BE9DE4761092732C00940D66 /* procs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = procs.h; sourceTree = "<group>"; };
-		BEE709AE1097B8DD0017B8A5 /* main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = main.cpp; sourceTree = "<group>"; };
-		BEEA39ED108BD89D00729F49 /* fft_execute.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fft_execute.cpp; sourceTree = "<group>"; };
-/* End PBXFileReference section */
-
-/* Begin PBXFrameworksBuildPhase section */
-		8DD76F660486A84900D96B5E /* Frameworks */ = {
-			isa = PBXFrameworksBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXFrameworksBuildPhase section */
-
-/* Begin PBXGroup section */
-		08FB7794FE84155DC02AAC07 /* OpenCL_FFT */ = {
-			isa = PBXGroup;
-			children = (
-				BE9DE4741092732C00940D66 /* param.txt */,
-				08FB7795FE84155DC02AAC07 /* Source */,
-				C6859E8C029090F304C91782 /* Documentation */,
-				1AB674ADFE9D54B511CA2CBB /* Products */,
-			);
-			name = OpenCL_FFT;
-			sourceTree = "<group>";
-		};
-		08FB7795FE84155DC02AAC07 /* Source */ = {
-			isa = PBXGroup;
-			children = (
-				BE9DE4761092732C00940D66 /* procs.h */,
-				BEEA39ED108BD89D00729F49 /* fft_execute.cpp */,
-				BE94A83C108AF8A100C1AD87 /* fft_setup.cpp */,
-				BEE709AE1097B8DD0017B8A5 /* main.cpp */,
-				BE94A7D4108ABFF000C1AD87 /* fft_internal.h */,
-				BE9DE3E010923A4E00940D66 /* fft_base_kernels.h */,
-				BE94A7CB108AB8BF00C1AD87 /* clFFT.h */,
-				BE94A7B2108AB33000C1AD87 /* fft_kernelstring.cpp */,
-			);
-			name = Source;
-			sourceTree = "<group>";
-		};
-		1AB674ADFE9D54B511CA2CBB /* Products */ = {
-			isa = PBXGroup;
-			children = (
-				8DD76F6C0486A84900D96B5E /* OpenCL_FFT */,
-			);
-			name = Products;
-			sourceTree = "<group>";
-		};
-		C6859E8C029090F304C91782 /* Documentation */ = {
-			isa = PBXGroup;
-			children = (
-			);
-			name = Documentation;
-			sourceTree = "<group>";
-		};
-/* End PBXGroup section */
-
-/* Begin PBXNativeTarget section */
-		8DD76F620486A84900D96B5E /* OpenCL_FFT */ = {
-			isa = PBXNativeTarget;
-			buildConfigurationList = 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "OpenCL_FFT" */;
-			buildPhases = (
-				8DD76F640486A84900D96B5E /* Sources */,
-				8DD76F660486A84900D96B5E /* Frameworks */,
-				8DD76F690486A84900D96B5E /* CopyFiles */,
-			);
-			buildRules = (
-			);
-			dependencies = (
-			);
-			name = OpenCL_FFT;
-			productInstallPath = "$(HOME)/bin";
-			productName = OpenCL_FFT;
-			productReference = 8DD76F6C0486A84900D96B5E /* OpenCL_FFT */;
-			productType = "com.apple.product-type.tool";
-		};
-/* End PBXNativeTarget section */
-
-/* Begin PBXProject section */
-		08FB7793FE84155DC02AAC07 /* Project object */ = {
-			isa = PBXProject;
-			buildConfigurationList = 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "OpenCL_FFT" */;
-			compatibilityVersion = "Xcode 3.1";
-			hasScannedForEncodings = 1;
-			mainGroup = 08FB7794FE84155DC02AAC07 /* OpenCL_FFT */;
-			projectDirPath = "";
-			projectRoot = "";
-			targets = (
-				8DD76F620486A84900D96B5E /* OpenCL_FFT */,
-			);
-		};
-/* End PBXProject section */
-
-/* Begin PBXSourcesBuildPhase section */
-		8DD76F640486A84900D96B5E /* Sources */ = {
-			isa = PBXSourcesBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				BE94A7B3108AB33000C1AD87 /* fft_kernelstring.cpp in Sources */,
-				BE94A83D108AF8A100C1AD87 /* fft_setup.cpp in Sources */,
-				BEEA39EE108BD89D00729F49 /* fft_execute.cpp in Sources */,
-				BEE709AF1097B8DD0017B8A5 /* main.cpp in Sources */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXSourcesBuildPhase section */
-
-/* Begin XCBuildConfiguration section */
-		1DEB923208733DC60010E9CD /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ALWAYS_SEARCH_USER_PATHS = NO;
-				COPY_PHASE_STRIP = NO;
-				GCC_DYNAMIC_NO_PIC = NO;
-				GCC_ENABLE_FIX_AND_CONTINUE = YES;
-				GCC_MODEL_TUNING = G5;
-				GCC_OPTIMIZATION_LEVEL = 0;
-				GCC_PREPROCESSOR_DEFINITIONS = (
-					"_GLIBCXX_DEBUG=1",
-					"_GLIBCXX_DEBUG_PEDANTIC=1",
-				);
-				INSTALL_PATH = /usr/local/bin;
-				PRODUCT_NAME = OpenCL_FFT;
-			};
-			name = Debug;
-		};
-		1DEB923308733DC60010E9CD /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ALWAYS_SEARCH_USER_PATHS = NO;
-				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
-				GCC_MODEL_TUNING = G5;
-				INSTALL_PATH = /usr/local/bin;
-				PRODUCT_NAME = OpenCL_FFT;
-			};
-			name = Release;
-		};
-		1DEB923608733DC60010E9CD /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ARCHS = "$(NATIVE_ARCH_ACTUAL)";
-				GCC_C_LANGUAGE_STANDARD = gnu99;
-				GCC_OPTIMIZATION_LEVEL = 0;
-				GCC_WARN_ABOUT_RETURN_TYPE = YES;
-				GCC_WARN_UNUSED_VARIABLE = YES;
-				ONLY_ACTIVE_ARCH = YES;
-				OTHER_LDFLAGS = (
-					"-framework",
-					OpenCL,
-					"-framework",
-					Accelerate,
-				);
-				PREBINDING = NO;
-				SDKROOT = "";
-			};
-			name = Debug;
-		};
-		1DEB923708733DC60010E9CD /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ARCHS = "$(NATIVE_ARCH_ACTUAL)";
-				GCC_C_LANGUAGE_STANDARD = gnu99;
-				GCC_WARN_ABOUT_RETURN_TYPE = YES;
-				GCC_WARN_UNUSED_VARIABLE = YES;
-				OTHER_LDFLAGS = (
-					"-framework",
-					OpenCL,
-					"-framework",
-					Accelerate,
-				);
-				PREBINDING = NO;
-				SDKROOT = "";
-			};
-			name = Release;
-		};
-/* End XCBuildConfiguration section */
-
-/* Begin XCConfigurationList section */
-		1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "OpenCL_FFT" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				1DEB923208733DC60010E9CD /* Debug */,
-				1DEB923308733DC60010E9CD /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-		1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "OpenCL_FFT" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				1DEB923608733DC60010E9CD /* Debug */,
-				1DEB923708733DC60010E9CD /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-/* End XCConfigurationList section */
-	};
-	rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
-}
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/ReadMe.txt b/RTCP/Cobalt/OpenCL_FFT/src/ReadMe.txt
deleted file mode 100644
index b90455fe13d5f7654f4d651fa81bf1dfc51181cd..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/ReadMe.txt
+++ /dev/null
@@ -1 +0,0 @@
-### OpenCL FFT (Fast Fourier Transform) ###

===========================================================================
DESCRIPTION:

This example shows how OpenCL can be used to compute FFT. Algorithm implemented
is described in the following references

1) Fitting FFT onto the G80 Architecture
   by Vasily Volkov and Brian Kazian
   University of California, Berkeley, May 19, 2008
   http://www.cs.berkeley.edu/~kubitron/courses/cs258-S08/projects/reports/project6_report.pdf
   
2) High Performance Discrete Fourier Tansforms on Graphics Processors
   by Naga K. Govindaraju, Brandon Lloyd, Yuri Dotsenko, Burton Smith, and John Manferdelli
   Supercomputing 2008.
   http://portal.acm.org/citation.cfm?id=1413373
   
Current version only supports power of two transform sizes however it should be straight forward
to extend the sample to non-power of two but power of base primes like 3, 5, 7. 

Current version supports 1D, 2D, 3D batched transforms. 

Current version supports both in-place and out-of-place transforms.

Current version supports both forward and inverse transform.

Current version supports both plannar and interleaved data format.

Current version only supports complex-to-complex transform. For real 
transform, one can use plannar data format with imaginary array mem set to zero. 

Current version only supports transform on GPU device. Accelerate framework can be used on CPU.

Current version supports sizes that fits in device global memory although "Twist Kernel" is 
included in fft plan if user wants to virtualize (implement sizes larger than what can fit 
in GPU global memory).

Users can dump all the kernels and global, local dimensions with which these kernels are run 
so that they can not only inspect/modify these kernels and understand how FFT is being 
computed on GPU, but also create their own stand along app for executing FFT of size of
their interest.

For any given signal size n, sample crates a clFFT_Plan, that encapsulates the kernel string, 
associated compiled cl_program. Note that kernel string is generated at runtime based on 
input size, dimension (1D, 2D, 3D) and data format (plannar or interleaved) along with some 
device depended parameters encapsulated in the clFFT_Plan. These device dependent parameters 
are set such that kernel is generated for high performance meeting following requirements

   1) Access pattern to global memory (of-chip DRAM) is such that memory transaction 
      coalesceing is achieved if device supports it thus achieving full bandwidth
   2) Local shuffles (matrix transposes or data sharing among work items of a workgroup)
      are band conflict free if local memory is banked.
   3) Kernel is fully optimized for memory hierarcy meaning that it uses GPU's large 
      vector register file, which is fastest, first before reverting to local memory 
      for data sharing among work items to save global DRAM bandwidth and only then 
      reverts to global memory if signal size is such that transform cannnot be computed
      by singal workgroup and thus require global communation among work groups.
      
Users can modify these parameters to get best performance on their particular GPU.     

Users how really want to understand the details of implementation are highly encouraged 
to read above two references but here is a high level description.
At a higher the algorithm decomposes signal of length N into factors as 

                   N = N1 x N2 x N3 x N4 x .... Nn
                   
where the factors (N1, ....., Nn) are sorted such that N1 is largest. It thus decomposes 
N into n-dimensional matrix. It than applies fft along each dimension, multiply by twiddle
factors and transposes the matrix as follow 

                      N2 x N3 x N4 x ............ x Nn x N1   (fft along N1 and transpose)
                      N3 x N4 x N5 x ....    x Nn x N2 x N1   (fft along N2 and transpose)
                      N4 x N5 x N6 x .. x Nn x N3 x N2 x N1   (fft along N3 and transpose)
                      
                      ......
                     Nn x Nn-1 x Nn-2 x ........ N3 x N2 x N1 (fft along Nn and transpose)
                     
 Decomposition is such that algorithm is fully optimized for memory hierarchy. N1 (largest
 base radix) is constrained by maximum register usage by work item (largest size of in-register 
 fft) and product N2 x N3 .... x Nn determine the maximum size of work group which is constrained
 by local memory used by work group (local memory is used to share data among work items i.e.
 local transposes). Togather these two parameters determine the maximum size fft that can be 
 computed by just using register file and local memory without reverting to global memory 
 for transpose (i.e. these sizes do not require global transpose and thus no inter work group 
 communication). However, for larger sizes, global communication among workgroup is required
 and multiple kernel launches are needed depending on the size and the base radix used. 
 
 For details of parameters user can play with, please see the comments in fft_internal.h
 and kernel_string.cpp, which has the main kernel generator functions ... especially
 see the comments preceeding function getRadixArray and getGlobalRadixInfo.
 User can adjust these parameters you achieve best performance on his device. 

Description of API Calls
=========================
clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );

This function creates a plan and returns a handle to it for use with other functions below. 
context    context in which things are happening
n          n.x, n.y, n.z contain the dimension of signal (length along each dimension)
dim        much be one of clFFT_1D, clFFT_2D, clFFT_3D for one, two or three dimensional fft
dataFormat much be either clFFT_InterleavedComplexFormat or clFFT_SplitComplexFormat for either interleaved or plannar data (real and imaginary)
error_code pointer for getting error back in plan creation. In case of error NULL plan is returned
==========================
void clFFT_DestroyPlan( clFFT_Plan plan );

Function to release/free resources
==========================
cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
								 cl_mem data_in, cl_mem data_out,
								 cl_int num_events, cl_event *event_list, cl_event *event );
								 
Function for interleaved fft execution.
queue      command queue for the device on which fft needs to be executed. It should be present in the context for this plan was created
plan       fft plan that was created using clFFT_CreatePlan
batchSize  size of the batch for batched transform
dir        much be either clFFT_Forward or clFFT_Inverse for forward or inverse transform
data_in    input data
data_out   output data. For in-place transform, pass same mem object for both data_in and data_out
num_events, event_list and event are for future use for letting fft fit in other CL based application pipeline through event dependency.
Not implemented in this version yet so these parameters are redundant right now. Just pass NULL.

=========================
cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
							 cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
							 cl_int num_events, cl_event *event_list, cl_event *event );
							 
Same as above but for plannar data type.							 
=========================
cl_int clFFT_1DTwistInterleaved( clFFT_Plan plan, cl_mem mem, size_t numRows, size_t numCols, size_t startRow, clFFT_Direction dir );

Function for applying twist (twiddle factor multiplication) for virtualizing computation of very large ffts that cannot fit into global
memory at once but can be decomposed into many global memory fitting ffts followed by twiddle multiplication (twist) followed by transpose
followed by again many global memory fitting ffts.

=========================
cl_int clFFT_1DTwistPlanner( clFFT_Plan plan, cl_mem mem_real, cl_mem mem_imag, size_t numRows, size_t numCols, size_t startRow, clFFT_Direction dir );

Same fucntion as above but for plannar data
=========================	

void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);	

Function to dump the plan. Passing stdout to file prints out the plan to standard out. It prints out
the kernel string and local, global dimension with which each kernel is executed in this plan.
						
==================================================================================
IMPORTANT NOTE ON PERFORMANCE:

Currently there are a few known performance issues (bug) that this sample has discovered
in rumtime and code generation that are being actively fixed. Hence, for sizes >= 1024, 
performance is much below the expected peak for any particular size. However, we have 
internally verified that once these bugs are fixed, performance should be on par with 
expected peak. Note that these are bugs in OpenCL runtime/compiler and not in this
sample.

===========================================================================
BUILD REQUIREMENTS:

Mac OS X v10.6 or later

If you are running in Xcode, be sure to pass file name "param.txt". You can do that
by double clicking OpenCL_FFT under executable and then click on Argument tab and 
add ./../../param.txt under "Arguments to be passed on launch" section. 

===========================================================================
RUNTIME REQUIREMENTS:

. Mac OS X v10.6 or later with OpenCL 1.0
. For good performance, device should support local memory. 
  FFT performance critically depend on how efficiently local shuffles 
  (matrix transposes) using local memory to reduce external DRAM bandwidth
  requirement.

===========================================================================
PACKAGING LIST:

AccelerateError.pdf
clFFT.h
Error.pdf
fft_base_kernels.h
fft_execute.cpp
fft_internal.h
fft_kernelstring.cpp
fft_setup.cpp
main.cpp
Makefile
OpenCL_FFT.xcodeproj
OpenCLError.pdf
param.txt
procs.h
ReadMe.txt

===========================================================================
CHANGES FROM PREVIOUS VERSIONS:

Version 1.0
- First version.

===========================================================================
Copyright (C) 2008 Apple Inc. All rights reserved.
\ No newline at end of file
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/clFFT.h b/RTCP/Cobalt/OpenCL_FFT/src/clFFT.h
deleted file mode 100644
index e893d95393a63af2a825778e3654b84c3ce62c4f..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/clFFT.h
+++ /dev/null
@@ -1,129 +0,0 @@
-
-//
-// File:       clFFT.h
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#ifndef __CLFFT_H
-#define __CLFFT_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <CL/cl.h>
-#include <stdio.h>
-
-// XForm type
-typedef enum 
-{
-	clFFT_Forward	= 	-1,
-	clFFT_Inverse	= 	 1
-	
-}clFFT_Direction;
-
-// XForm dimension
-typedef enum
-{
-	clFFT_1D	= 0,
-	clFFT_2D	= 1,
-	clFFT_3D	= 3
-	
-}clFFT_Dimension;
-
-// XForm Data type
-typedef enum
-{
-	clFFT_SplitComplexFormat       = 0,
-	clFFT_InterleavedComplexFormat = 1
-}clFFT_DataFormat;
-
-typedef struct
-{
-	unsigned int x;
-	unsigned int y;
-	unsigned int z;
-}clFFT_Dim3;	
-	
-typedef struct
-{
-	float *real;
-	float *imag;
-} clFFT_SplitComplex;
-
-typedef struct
-{
-	float real;
-	float imag;
-}clFFT_Complex;
-
-typedef void* clFFT_Plan;	
-
-clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );
-
-void clFFT_DestroyPlan( clFFT_Plan plan );
-
-cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
-								 cl_mem data_in, cl_mem data_out,
-								 cl_int num_events, cl_event *event_list, cl_event *event );
-
-cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, 
-							 cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
-							 cl_int num_events, cl_event *event_list, cl_event *event );
-
-cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, 
-						        size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
-	
-
-cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, 
-					        size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
-	
-void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);	
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif 
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/fft_base_kernels.h b/RTCP/Cobalt/OpenCL_FFT/src/fft_base_kernels.h
deleted file mode 100644
index 101795697f55e125e4fbafa8757aef5de033fad6..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/fft_base_kernels.h
+++ /dev/null
@@ -1,277 +0,0 @@
-
-//
-// File:       fft_base_kernels.h
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#ifndef __CL_FFT_BASE_KERNELS_
-#define __CL_FFT_BASE_KERNELS_
-
-#include <string>
-
-using namespace std;
-
-static string baseKernels = string(
-                          "#ifndef M_PI\n"
-						  "#define M_PI 0x1.921fb54442d18p+1\n"
-						  "#endif\n"
-						  "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
-						  "#define conj(a) ((float2)((a).x, -(a).y))\n"
-						  "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"		   
-						  "\n"
-						  "#define fftKernel2(a,dir) \\\n"
-						  "{ \\\n"
-						  "    float2 c = (a)[0];    \\\n"
-						  "    (a)[0] = c + (a)[1];  \\\n"
-						  "    (a)[1] = c - (a)[1];  \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel2S(d1,d2,dir) \\\n"
-						  "{ \\\n"
-						  "    float2 c = (d1);   \\\n"
-						  "    (d1) = c + (d2);   \\\n"
-						  "    (d2) = c - (d2);   \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel4(a,dir) \\\n"
-						  "{ \\\n"
-						  "    fftKernel2S((a)[0], (a)[2], dir); \\\n"
-						  "    fftKernel2S((a)[1], (a)[3], dir); \\\n"
-						  "    fftKernel2S((a)[0], (a)[1], dir); \\\n"
-						  "    (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
-						  "    fftKernel2S((a)[2], (a)[3], dir); \\\n"
-						  "    float2 c = (a)[1]; \\\n"
-						  "    (a)[1] = (a)[2]; \\\n"
-						  "    (a)[2] = c; \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
-						  "{ \\\n"
-						  "    fftKernel2S((a0), (a2), dir); \\\n"
-						  "    fftKernel2S((a1), (a3), dir); \\\n"
-						  "    fftKernel2S((a0), (a1), dir); \\\n"
-						  "    (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
-						  "    fftKernel2S((a2), (a3), dir); \\\n"
-						  "    float2 c = (a1); \\\n"
-						  "    (a1) = (a2); \\\n"
-						  "    (a2) = c; \\\n" 
-						  "}\n"
-						  "\n"						  
-						  "#define bitreverse8(a) \\\n"
-						  "{ \\\n"
-						  "    float2 c; \\\n"
-						  "    c = (a)[1]; \\\n"
-						  "    (a)[1] = (a)[4]; \\\n"
-						  "    (a)[4] = c; \\\n"
-						  "    c = (a)[3]; \\\n"
-						  "    (a)[3] = (a)[6]; \\\n"
-						  "    (a)[6] = c; \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel8(a,dir) \\\n"
-						  "{ \\\n"
-						  "	const float2 w1  = (float2)(0x1.6a09e6p-1f,  dir*0x1.6a09e6p-1f);  \\\n"
-						  "	const float2 w3  = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f);  \\\n"
-						  "	float2 c; \\\n"
-						  "	fftKernel2S((a)[0], (a)[4], dir); \\\n"
-						  "	fftKernel2S((a)[1], (a)[5], dir); \\\n"
-						  "	fftKernel2S((a)[2], (a)[6], dir); \\\n"
-						  "	fftKernel2S((a)[3], (a)[7], dir); \\\n"
-						  "	(a)[5] = complexMul(w1, (a)[5]); \\\n"
-						  "	(a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
-						  "	(a)[7] = complexMul(w3, (a)[7]); \\\n"
-						  "	fftKernel2S((a)[0], (a)[2], dir); \\\n"
-						  "	fftKernel2S((a)[1], (a)[3], dir); \\\n"
-						  "	fftKernel2S((a)[4], (a)[6], dir); \\\n"
-						  "	fftKernel2S((a)[5], (a)[7], dir); \\\n"
-						  "	(a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
-						  "	(a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
-						  "	fftKernel2S((a)[0], (a)[1], dir); \\\n"
-						  "	fftKernel2S((a)[2], (a)[3], dir); \\\n"
-						  "	fftKernel2S((a)[4], (a)[5], dir); \\\n"
-						  "	fftKernel2S((a)[6], (a)[7], dir); \\\n"
-						  "	bitreverse8((a)); \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define bitreverse4x4(a) \\\n"
-						  "{ \\\n"
-						  "	float2 c; \\\n"
-						  "	c = (a)[1];  (a)[1]  = (a)[4];  (a)[4]  = c; \\\n"
-						  "	c = (a)[2];  (a)[2]  = (a)[8];  (a)[8]  = c; \\\n"
-						  "	c = (a)[3];  (a)[3]  = (a)[12]; (a)[12] = c; \\\n"
-						  "	c = (a)[6];  (a)[6]  = (a)[9];  (a)[9]  = c; \\\n"
-						  "	c = (a)[7];  (a)[7]  = (a)[13]; (a)[13] = c; \\\n"
-						  "	c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel16(a,dir) \\\n"
-						  "{ \\\n"
-						  "    const float w0 = 0x1.d906bcp-1f; \\\n"
-						  "    const float w1 = 0x1.87de2ap-2f; \\\n"
-						  "    const float w2 = 0x1.6a09e6p-1f; \\\n"
-						  "    fftKernel4s((a)[0], (a)[4], (a)[8],  (a)[12], dir); \\\n"
-						  "    fftKernel4s((a)[1], (a)[5], (a)[9],  (a)[13], dir); \\\n"
-						  "    fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
-						  "    fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
-						  "    (a)[5]  = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
-						  "    (a)[6]  = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
-						  "    (a)[7]  = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
-						  "    (a)[9]  = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
-						  "    (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
-						  "    (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
-						  "    (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
-						  "    (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
-						  "    (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
-						  "    fftKernel4((a), dir); \\\n"
-						  "    fftKernel4((a) + 4, dir); \\\n"
-						  "    fftKernel4((a) + 8, dir); \\\n"
-						  "    fftKernel4((a) + 12, dir); \\\n"
-						  "    bitreverse4x4((a)); \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define bitreverse32(a) \\\n"
-						  "{ \\\n"
-						  "    float2 c1, c2; \\\n"
-						  "    c1 = (a)[2];   (a)[2] = (a)[1];   c2 = (a)[4];   (a)[4] = c1;   c1 = (a)[8];   (a)[8] = c2;    c2 = (a)[16];  (a)[16] = c1;   (a)[1] = c2; \\\n"
-						  "    c1 = (a)[6];   (a)[6] = (a)[3];   c2 = (a)[12];  (a)[12] = c1;  c1 = (a)[24];  (a)[24] = c2;   c2 = (a)[17];  (a)[17] = c1;   (a)[3] = c2; \\\n"
-						  "    c1 = (a)[10];  (a)[10] = (a)[5];  c2 = (a)[20];  (a)[20] = c1;  c1 = (a)[9];   (a)[9] = c2;    c2 = (a)[18];  (a)[18] = c1;   (a)[5] = c2; \\\n"
-						  "    c1 = (a)[14];  (a)[14] = (a)[7];  c2 = (a)[28];  (a)[28] = c1;  c1 = (a)[25];  (a)[25] = c2;   c2 = (a)[19];  (a)[19] = c1;   (a)[7] = c2; \\\n"
-						  "    c1 = (a)[22];  (a)[22] = (a)[11]; c2 = (a)[13];  (a)[13] = c1;  c1 = (a)[26];  (a)[26] = c2;   c2 = (a)[21];  (a)[21] = c1;   (a)[11] = c2; \\\n"
-						  "    c1 = (a)[30];  (a)[30] = (a)[15]; c2 = (a)[29];  (a)[29] = c1;  c1 = (a)[27];  (a)[27] = c2;   c2 = (a)[23];  (a)[23] = c1;   (a)[15] = c2; \\\n"
-						  "}\n"
-						  "\n"						  
-						  "#define fftKernel32(a,dir) \\\n"
-						  "{ \\\n"
-						  "    fftKernel2S((a)[0],  (a)[16], dir); \\\n"
-						  "    fftKernel2S((a)[1],  (a)[17], dir); \\\n"
-						  "    fftKernel2S((a)[2],  (a)[18], dir); \\\n"
-						  "    fftKernel2S((a)[3],  (a)[19], dir); \\\n"
-						  "    fftKernel2S((a)[4],  (a)[20], dir); \\\n"
-						  "    fftKernel2S((a)[5],  (a)[21], dir); \\\n"
-						  "    fftKernel2S((a)[6],  (a)[22], dir); \\\n"
-						  "    fftKernel2S((a)[7],  (a)[23], dir); \\\n"
-						  "    fftKernel2S((a)[8],  (a)[24], dir); \\\n"
-						  "    fftKernel2S((a)[9],  (a)[25], dir); \\\n"
-						  "    fftKernel2S((a)[10], (a)[26], dir); \\\n"
-						  "    fftKernel2S((a)[11], (a)[27], dir); \\\n"
-						  "    fftKernel2S((a)[12], (a)[28], dir); \\\n"
-						  "    fftKernel2S((a)[13], (a)[29], dir); \\\n"
-						  "    fftKernel2S((a)[14], (a)[30], dir); \\\n"
-						  "    fftKernel2S((a)[15], (a)[31], dir); \\\n"
-						  "    (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
-						  "    (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
-						  "    (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
-						  "    (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
-						  "    (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
-						  "    (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
-						  "    (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
-						  "    (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
-						  "    (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
-						  "    (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
-						  "    (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
-						  "    (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
-						  "    (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
-						  "    (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
-						  "    (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
-						  "    fftKernel16((a), dir); \\\n"
-						  "    fftKernel16((a) + 16, dir); \\\n"
-						  "    bitreverse32((a)); \\\n"
-						  "}\n\n"
-						  );
-
-static string twistKernelInterleaved = string(
-											  "__kernel void \\\n"
-											  "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
-											  "{ \\\n"
-											  "   float2 a, w; \\\n"
-											  "   float ang; \\\n"
-											  "   unsigned int j; \\\n"
-											  "	unsigned int i = get_global_id(0); \\\n"
-											  "	unsigned int startIndex = i; \\\n"
-											  "	 \\\n"
-											  "	if(i < numCols) \\\n"
-											  "	{ \\\n"
-											  "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
-											  "	    { \\\n"
-											  "	        a = in[startIndex]; \\\n"
-											  "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
-											  "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
-											  "	        a = complexMul(a, w); \\\n"
-											  "	        in[startIndex] = a; \\\n"
-											  "	        startIndex += numCols; \\\n"
-											  "	    } \\\n"
-											  "	}	 \\\n"
-											  "} \\\n"
-											  );
-
-static string twistKernelPlannar = string(
-										  "__kernel void \\\n"
-										  "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
-										  "{ \\\n"
-										  "    float2 a, w; \\\n"
-										  "    float ang; \\\n"
-										  "    unsigned int j; \\\n"
-										  "	unsigned int i = get_global_id(0); \\\n"
-										  "	unsigned int startIndex = i; \\\n"
-										  "	 \\\n"
-										  "	if(i < numCols) \\\n"
-										  "	{ \\\n"
-										  "	    for(j = 0; j < numRowsToProcess; j++) \\\n"
-										  "	    { \\\n"
-										  "	        a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
-										  "	        ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
-										  "	        w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
-										  "	        a = complexMul(a, w); \\\n"
-										  "	        in_real[startIndex] = a.x; \\\n"
-										  "	        in_imag[startIndex] = a.y; \\\n"
-										  "	        startIndex += numCols; \\\n"
-										  "	    } \\\n"
-										  "	}	 \\\n"
-										  "} \\\n"
-										  );										  
-
-
-
-#endif
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/fft_execute.cpp b/RTCP/Cobalt/OpenCL_FFT/src/fft_execute.cpp
deleted file mode 100644
index 64dacdfbdbcec3e1fa879b9236f0015775570305..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/fft_execute.cpp
+++ /dev/null
@@ -1,405 +0,0 @@
-
-//
-// File:       fft_execute.cpp
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.¬
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#include "fft_internal.h"
-#include "clFFT.h"
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-
-#define max(a,b) (((a)>(b)) ? (a) : (b))
-#define min(a,b) (((a)<(b)) ? (a) : (b))
-
-static cl_int
-allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
-{
-	cl_int err = CL_SUCCESS;
-	if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) 
-	{
-		plan->last_batch_size = batchSize; 
-		size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
-		
-		if(plan->tempmemobj)
-			clReleaseMemObject(plan->tempmemobj);
-			
-		plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
-	}
-	return err;	
-}
-
-static cl_int
-allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
-{
-	cl_int err = CL_SUCCESS;
-	cl_int terr;
-	if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) 
-	{
-		plan->last_batch_size = batchSize; 
-		size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
-		
-		if(plan->tempmemobj_real)
-			clReleaseMemObject(plan->tempmemobj_real);
-
-		if(plan->tempmemobj_imag)
-			clReleaseMemObject(plan->tempmemobj_imag);			
-			
-		plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
-		plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
-		err |= terr;
- 	}	
-	return err;
-}
-
-void
-getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
-{
-	*lWorkItems = kernelInfo->num_workitems_per_workgroup;
-	int numWorkGroups = kernelInfo->num_workgroups;
-    int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup;
-	
-	switch(kernelInfo->dir)
-	{
-		case cl_fft_kernel_x:
-            *batchSize *= (plan->n.y * plan->n.z);
-            numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG);
-            numWorkGroups *= kernelInfo->num_workgroups;
-			break;
-		case cl_fft_kernel_y:
-			*batchSize *= plan->n.z;
-			numWorkGroups *= *batchSize;
-			break;
-		case cl_fft_kernel_z:
-			numWorkGroups *= *batchSize;
-			break;
-	}
-	
-	*gWorkItems = numWorkGroups * *lWorkItems;
-}
-
-cl_int 
-clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, 
-						 cl_mem data_in, cl_mem data_out, 
-						 cl_int num_events, cl_event *event_list, cl_event *event )
-{	
-	int s;
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	if(plan->format != clFFT_InterleavedComplexFormat)
-		return CL_INVALID_VALUE;
-	
-	cl_int err;
-	size_t gWorkItems, lWorkItems;
-	int inPlaceDone;
-	
-	cl_int isInPlace = data_in == data_out ? 1 : 0;
-	
-	if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
-		return err;	
-	
-	cl_mem memObj[3];
-	memObj[0] = data_in;
-	memObj[1] = data_out;
-	memObj[2] = plan->tempmemobj;
-	cl_fft_kernel_info *kernelInfo = plan->kernel_info;
-	int numKernels = plan->num_kernels;
-	
-	int numKernelsOdd = numKernels & 1;
-	int currRead  = 0;
-	int currWrite = 1;
-	
-	// at least one external dram shuffle (transpose) required
-	if(plan->temp_buffer_needed) 
-	{
-		// in-place transform
-		if(isInPlace) 
-		{
-			inPlaceDone = 0;
-			currRead  = 1;
-			currWrite = 2;
-		}
-		else
-		{
-			currWrite = (numKernels & 1) ? 1 : 2;
-		}
-		
-		while(kernelInfo) 
-		{
-			if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) 
-			{
-				currWrite = currRead;
-				inPlaceDone = 1;
-			}
-			
-			s = batchSize;
-			getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
-			err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
-			err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
-			err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
-			err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
-			
-			err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, num_events, event_list, event);
-			if(err)
-				return err;
-			
-			currRead  = (currWrite == 1) ? 1 : 2;
-			currWrite = (currWrite == 1) ? 2 : 1; 
-			
-			kernelInfo = kernelInfo->next;
-		}			
-	}
-	// no dram shuffle (transpose required) transform
-	// all kernels can execute in-place.
-	else {
-		
-		while(kernelInfo)
-		{
-		    s = batchSize;
-		    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
-		    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
-		    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
-		
-		    err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, num_events, event_list, event);
-		    if(err)
-			    return err;		
-			
-			currRead  = 1;
-			currWrite = 1;
-			
-			kernelInfo = kernelInfo->next;
-		}
-	}
-	
-	return err;
-}
-
-cl_int 
-clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, 
-					  cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
-					  cl_int num_events, cl_event *event_list, cl_event *event)
-{	
-	int s;
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	
-	if(plan->format != clFFT_SplitComplexFormat)
-		return CL_INVALID_VALUE;
-	
-	cl_int err;
-	size_t gWorkItems, lWorkItems;
-	int inPlaceDone;
-	
-	cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
-	
-	if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
-		return err;	
-	
-	cl_mem memObj_real[3];
-	cl_mem memObj_imag[3];
-	memObj_real[0] = data_in_real;
-	memObj_real[1] = data_out_real;
-	memObj_real[2] = plan->tempmemobj_real;
-	memObj_imag[0] = data_in_imag;
-	memObj_imag[1] = data_out_imag;
-	memObj_imag[2] = plan->tempmemobj_imag;
-		
-	cl_fft_kernel_info *kernelInfo = plan->kernel_info;
-	int numKernels = plan->num_kernels;
-	
-	int numKernelsOdd = numKernels & 1;
-	int currRead  = 0;
-	int currWrite = 1;
-	
-	// at least one external dram shuffle (transpose) required
-	if(plan->temp_buffer_needed) 
-	{
-		// in-place transform
-		if(isInPlace) 
-		{
-			inPlaceDone = 0;
-			currRead  = 1;
-			currWrite = 2;
-		}
-		else
-		{
-			currWrite = (numKernels & 1) ? 1 : 2;
-		}
-		
-		while(kernelInfo) 
-		{
-			if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) 
-			{
-				currWrite = currRead;
-				inPlaceDone = 1;
-			}
-			
-			s = batchSize;
-			getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
-			err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
-			err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
-			err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
-			err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
-			err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
-			err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
-			
-			err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, num_events, event_list, event);
-			if(err)
-				return err;			
-			
-			currRead  = (currWrite == 1) ? 1 : 2;
-			currWrite = (currWrite == 1) ? 2 : 1; 
-			
-			kernelInfo = kernelInfo->next;
-		}			
-	}
-	// no dram shuffle (transpose required) transform
-	else {
-		
-		while(kernelInfo)
-		{
-		    s = batchSize;
-		    getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
-		    err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
-		    err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
-		    err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
-		
-		    err |= clEnqueueNDRangeKernel(queue,  kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, num_events, event_list, event);
-		    if(err)
-			    return err;	
-			
-			currRead  = 1;
-			currWrite = 1;
-		
-			kernelInfo = kernelInfo->next;
-		}
-	}
-	
-	return err;
-}
-
-cl_int 
-clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, 
-						 size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
-{
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	
-	unsigned int N = numRows*numCols;
-	unsigned int nCols = numCols;
-	unsigned int sRow = startRow;
-	unsigned int rToProcess = rowsToProcess;
-	int d = dir;
-	int err = 0;
-	
-	cl_device_id device_id;
-	err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
-	if(err)
-	    return err;
-	
-	size_t gSize;
-	err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
-	if(err)
-	    return err;
-	      
-	gSize = min(128, gSize);
-	size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
-	size_t numLocalThreads[1]  = { gSize };
-	
-	err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
-	err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
-	err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
-	err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
-	err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
-	err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
-	
-	err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);            
-	
-	return err;	
-}
-
-cl_int 
-clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, 
-					 size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir)
-{
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	
-	unsigned int N = numRows*numCols;
-	unsigned int nCols = numCols;
-	unsigned int sRow = startRow;
-	unsigned int rToProcess = rowsToProcess;
-	int d = dir;
-	int err = 0;
-	
-	cl_device_id device_id;
-	err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
-	if(err)
-	    return err;
-	
-	size_t gSize;
-	err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
-	if(err)
-	    return err;
-	      
-	gSize = min(128, gSize);
-	size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
-	size_t numLocalThreads[1]  = { gSize };
-	
-	err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
-	err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
-	err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
-	err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
-	err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
-	err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
-	err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
-	
-	err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);            
-	
-	return err;	
-}
-
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/fft_internal.h b/RTCP/Cobalt/OpenCL_FFT/src/fft_internal.h
deleted file mode 100644
index a45b69c98af037b2228aa29b4a046b1c4f1cf86f..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/fft_internal.h
+++ /dev/null
@@ -1,163 +0,0 @@
-
-//
-// File:       fft_internal.h
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#ifndef __CLFFT_INTERNAL_H
-#define __CLFFT_INTERNAL_H
-
-#include "clFFT.h"
-#include <iostream>
-#include <string>
-#include <sstream>
-
-using namespace std;
-
-typedef enum kernel_dir_t
-{
-	cl_fft_kernel_x,
-	cl_fft_kernel_y,
-	cl_fft_kernel_z
-}cl_fft_kernel_dir;
-
-typedef struct kernel_info_t
-{
-	cl_kernel kernel;
-	char *kernel_name;
-	size_t lmem_size;
-	size_t num_workgroups;
-    size_t num_xforms_per_workgroup;
-	size_t num_workitems_per_workgroup;
-	cl_fft_kernel_dir dir;
-	int in_place_possible;
-	kernel_info_t *next;
-}cl_fft_kernel_info;
-
-typedef struct 
-{
-	// context in which fft resources are created and kernels are executed
-	cl_context              context;
-	
-	// size of signal
-	clFFT_Dim3              n;
-	
-	// dimension of transform ... must be either 1D, 2D or 3D
-	clFFT_Dimension			dim;
-	
-	// data format ... must be either interleaved or plannar
-	clFFT_DataFormat		format;
-	
-	// string containing kernel source. Generated at runtime based on
-	// n, dim, format and other parameters
-	string                  *kernel_string;
-	
-	// CL program containing source and kernel this particular 
-	// n, dim, data format
-	cl_program				program;
-	
-	// linked list of kernels which needs to be executed for this fft
-	cl_fft_kernel_info		*kernel_info;
-	
-	// number of kernels
-	int                     num_kernels;
-	
-	// twist kernel for virtualizing fft of very large sizes that do not
-	// fit in GPU global memory
-	cl_kernel				twist_kernel;
-	
-	// flag indicating if temporary intermediate buffer is needed or not.
-	// this depends on fft kernels being executed and if transform is 
-	// in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... 
-	// one that does not require global transpose do not need temporary buffer)
-	// 2D 1024x1024 out-of-place fft however do require intermediate buffer.
-	// If temp buffer is needed, its allocation is lazy i.e. its not allocated
-	// until its needed
-	cl_int                  temp_buffer_needed;
-	
-	// Batch size is runtime parameter and size of temporary buffer (if needed)
-	// depends on batch size. Allocation of temporary buffer is lazy i.e. its
-	// only created when needed. Once its created at first call of clFFT_Executexxx
-	// it is not allocated next time if next time clFFT_Executexxx is called with 
-	// batch size different than the first call. last_batch_size caches the last
-	// batch size with which this plan is used so that we dont keep allocating/deallocating
-	// temp buffer if same batch size is used again and again.
-	size_t                  last_batch_size;
-	
-	// temporary buffer for interleaved plan
-	cl_mem   				tempmemobj;
-	
-	// temporary buffer for planner plan. Only one of tempmemobj or 
-	// (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending 
-	// data format of plan (plannar or interleaved)
-	cl_mem                  tempmemobj_real, tempmemobj_imag;
-	
-	// Maximum size of signal for which local memory transposed based
-	// fft is sufficient i.e. no global mem transpose (communication)
-	// is needed
-	size_t					max_localmem_fft_size;
-	
-	// Maximum work items per work group allowed. This, along with max_radix below controls 
-	// maximum local memory being used by fft kernels of this plan. Set to 256 by default
-	size_t                  max_work_item_per_workgroup;
-	
-	// Maximum base radix for local memory fft ... this controls the maximum register 
-	// space used by work items. Currently defaults to 16
-	size_t                  max_radix;
-	
-	// Device depended parameter that tells how many work-items need to be read consecutive
-	// values to make sure global memory access by work-items of a work-group result in 
-	// coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
-	size_t                  min_mem_coalesce_width;
-	
-	// Number of local memory banks. This is used to geneate kernel with local memory 
-	// transposes with appropriate padding to avoid bank conflicts to local memory
-	// e.g. on NVidia it is 16.
-	size_t                  num_local_mem_banks;
-}cl_fft_plan;
-
-void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
-
-#endif  
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/fft_kernelstring.cpp b/RTCP/Cobalt/OpenCL_FFT/src/fft_kernelstring.cpp
deleted file mode 100644
index 71a7633a45a00a1b21c02363c0adace90cdc8c71..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/fft_kernelstring.cpp
+++ /dev/null
@@ -1,1256 +0,0 @@
-
-//
-// File:       fft_kernelstring.cpp
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <cstring>
-#include <cassert>
-#include "fft_internal.h"
-#include "clFFT.h"
-
-using namespace std;
-
-#define max(A,B) ((A) > (B) ? (A) : (B))
-#define min(A,B) ((A) < (B) ? (A) : (B))
-
-static string 
-num2str(int num)
-{
-	char temp[200];
-	sprintf(temp, "%d", num);
-	return string(temp);
-}
-
-// For any n, this function decomposes n into factors for loacal memory tranpose 
-// based fft. Factors (radices) are sorted such that the first one (radixArray[0])
-// is the largest. This base radix determines the number of registers used by each
-// work item and product of remaining radices determine the size of work group needed.
-// To make things concrete with and example, suppose n = 1024. It is decomposed into
-// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and 
-// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length
-// 16 ffts (64 work items working in parallel) following by transpose using local 
-// memory followed by again 64 length 16 ffts followed by transpose using local memory
-// followed by 256 length 4 ffts. For the last step since with size of work group is 
-// 64 and each work item can array for 16 values, 64 work items can compute 256 length
-// 4 ffts by each work item computing 4 length 4 ffts. 
-// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
-// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed
-// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
-// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
-// by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
-// for the last step, each work item computes two length 4 in-register ffts and thus
-// 256 work items are needed to compute all 512 ffts. 
-// For n = 32 = 8 x 4, 4 work items first compute 4 in-register 
-// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register
-// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
-// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, 
-// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). 
-// Users can play with these parameters to figure what gives best performance on
-// their particular device i.e. some device have less register space thus using
-// smaller base radix can avoid spilling ... some has small local memory thus 
-// using smaller work group size may be required etc
-
-static void 
-getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix)
-{
-    if(maxRadix > 1)
-    {
-        maxRadix = min(n, maxRadix);
-        unsigned int cnt = 0;
-        while(n > maxRadix)
-        {
-            radixArray[cnt++] = maxRadix;
-            n /= maxRadix;
-        }
-        radixArray[cnt++] = n;
-        *numRadices = cnt;
-        return;
-    }
-
-	switch(n) 
-	{
-		case 2:
-			*numRadices = 1;
-			radixArray[0] = 2;
-			break;
-			
-		case 4:
-			*numRadices = 1;
-			radixArray[0] = 4;
-			break;
-			
-		case 8:
-			*numRadices = 1;
-			radixArray[0] = 8;
-			break;
-			
-		case 16:
-			*numRadices = 2;
-			radixArray[0] = 8; radixArray[1] = 2; 
-			break;
-			
-		case 32:
-			*numRadices = 2;
-			radixArray[0] = 8; radixArray[1] = 4;
-			break;
-			
-		case 64:
-			*numRadices = 2;
-			radixArray[0] = 8; radixArray[1] = 8;
-			break;
-			
-		case 128:
-			*numRadices = 3;
-			radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4;
-			break;
-			
-		case 256:
-			*numRadices = 4;
-			radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4;
-			break;
-			
-		case 512:
-			*numRadices = 3;
-			radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8;
-			break;			
-			
-		case 1024:
-			*numRadices = 3;
-			radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4;
-			break;	
-		case 2048:
-			*numRadices = 4;
-			radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4;
-			break;
-		default:
-			*numRadices = 0;
-			return;
-	}
-}
-
-static void
-insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
-{
-	if(dataFormat == clFFT_SplitComplexFormat) 
-		kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
-	else 
-		kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
-}
-
-static void 
-insertVariables(string &kStream, int maxRadix)
-{
-	kStream += string("    int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n");
-    kStream += string("    int s, ii, jj, offset;\n");
-	kStream += string("    float2 w;\n");
-	kStream += string("    float ang, angf, ang1;\n");
-    kStream += string("    __local float *lMemStore, *lMemLoad;\n");
-    kStream += string("    float2 a[") +  num2str(maxRadix) + string("];\n");
-    kStream += string("    int lId = get_local_id( 0 );\n");
-    kStream += string("    int groupId = get_group_id( 0 );\n");
-}
-
-static void
-formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
-{
-	if(dataFormat == clFFT_InterleavedComplexFormat)
-		kernelString += string("        a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
-	else
-	{
-		kernelString += string("        a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
-		kernelString += string("        a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n");
-	}
-}
-
-static void
-formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
-{
-	if(dataFormat == clFFT_InterleavedComplexFormat)
-		kernelString += string("        out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
-	else
-	{
-		kernelString += string("        out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
-		kernelString += string("        out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n");
-	}
-}
-
-static int
-insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
-{
-	int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm);
-	int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
-	int i, j;
-	int lMemSize = 0;
-	
-	if(numXFormsPerWG > 1)
-	    kernelString += string("        s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
-	
-    if(numWorkItemsPerXForm >= mem_coalesce_width)
-    {   		
-		if(numXFormsPerWG > 1)
-		{
-            kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n");
-            kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
-            kernelString += string("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
-			kernelString += string("        offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n");
-			if(dataFormat == clFFT_InterleavedComplexFormat)
-			{
-			    kernelString += string("        in += offset;\n");
-			    kernelString += string("        out += offset;\n");
-			}
-			else
-			{
-			    kernelString += string("        in_real += offset;\n");
-				kernelString += string("        in_imag += offset;\n");
-			    kernelString += string("        out_real += offset;\n");
-				kernelString += string("        out_imag += offset;\n");
-			}
-			for(i = 0; i < R0; i++)
-				formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
-			kernelString += string("    }\n");
-		}
-		else
-		{
-			kernelString += string("    ii = lId;\n");
-			kernelString += string("    jj = 0;\n");
-			kernelString += string("    offset =  mad24(groupId, ") + num2str(N) + string(", ii);\n");
-			if(dataFormat == clFFT_InterleavedComplexFormat)
-			{
-			    kernelString += string("        in += offset;\n");
-			    kernelString += string("        out += offset;\n");
-			}
-			else
-			{
-			    kernelString += string("        in_real += offset;\n");
-				kernelString += string("        in_imag += offset;\n");
-			    kernelString += string("        out_real += offset;\n");
-				kernelString += string("        out_imag += offset;\n");
-			}
-			for(i = 0; i < R0; i++)
-				formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
-		}
-    }
-    else if( N >= mem_coalesce_width )
-    {
-        int numInnerIter = N / mem_coalesce_width;
-        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
-		
-        kernelString += string("    ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
-        kernelString += string("    jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
-        kernelString += string("    lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
-        kernelString += string("    offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n");
-        kernelString += string("    offset = mad24( offset, ") + num2str(N) + string(", ii );\n");
-		if(dataFormat == clFFT_InterleavedComplexFormat)
-		{
-			kernelString += string("        in += offset;\n");
-			kernelString += string("        out += offset;\n");
-		}
-		else
-		{
-			kernelString += string("        in_real += offset;\n");
-			kernelString += string("        in_imag += offset;\n");
-			kernelString += string("        out_real += offset;\n");
-			kernelString += string("        out_imag += offset;\n");
-		}
-        
-		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
-        for(i = 0; i < numOuterIter; i++ )
-        {
-            kernelString += string("    if( jj < s ) {\n");
-			for(j = 0; j < numInnerIter; j++ ) 
-				formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
-			kernelString += string("    }\n"); 
-			if(i != numOuterIter - 1)
-			    kernelString += string("    jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");			 
-        }
-		kernelString += string("}\n ");
-		kernelString += string("else {\n");
-        for(i = 0; i < numOuterIter; i++ )
-        {
-			for(j = 0; j < numInnerIter; j++ ) 
-				formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);			
-        }		
-		kernelString += string("}\n");
-        
-		kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
-		kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
-        kernelString += string("    lMemLoad  = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n");  
-		
-        for( i = 0; i < numOuterIter; i++ )
-		{
-			for( j = 0; j < numInnerIter; j++ )
-			{	
-				kernelString += string("    lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + 
-				                num2str(i * numInnerIter + j) + string("].x;\n");
-			}
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-        
-        for( i = 0; i < R0; i++ )
-			kernelString += string("    a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");            
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");  
-
-	    for( i = 0; i < numOuterIter; i++ )
-		{
-			for( j = 0; j < numInnerIter; j++ )
-			{	
-				kernelString += string("    lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + 
-								num2str(i * numInnerIter + j) + string("].y;\n");
-			}
-	    }	
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-																						   
-		for( i = 0; i < R0; i++ )
-			kernelString += string("    a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");            
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");  
-		
-		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
-    }  
-    else
-    {
-        kernelString += string("    offset = mad24( groupId,  ") + num2str(N * numXFormsPerWG) + string(", lId );\n");
-		if(dataFormat == clFFT_InterleavedComplexFormat)
-		{
-			kernelString += string("        in += offset;\n");
-			kernelString += string("        out += offset;\n");
-		}
-		else
-		{
-			kernelString += string("        in_real += offset;\n");
-			kernelString += string("        in_imag += offset;\n");
-			kernelString += string("        out_real += offset;\n");
-			kernelString += string("        out_imag += offset;\n");
-		}
-        
-        kernelString += string("    ii = lId & ") + num2str(N-1) + string(";\n");
-        kernelString += string("    jj = lId >> ") + num2str((int)log2(N)) + string(";\n");
-        kernelString += string("    lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
-        
-		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
-        for( i = 0; i < R0; i++ )
-        {
-            kernelString += string("    if(jj < s )\n");
-			formattedLoad(kernelString, i, i*groupSize, dataFormat);
-			if(i != R0 - 1)
-			    kernelString += string("    jj += ") + num2str(groupSize / N) + string(";\n");
-        }
-		kernelString += string("}\n");
-		kernelString += string("else {\n");
-        for( i = 0; i < R0; i++ )
-        {
-			formattedLoad(kernelString, i, i*groupSize, dataFormat);
-        }		
-		kernelString += string("}\n");
-        
-		if(numWorkItemsPerXForm > 1)
-		{
-            kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
-            kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
-            kernelString += string("    lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); 
-		}
-		else 
-		{
-            kernelString += string("    ii = 0;\n");
-            kernelString += string("    jj = lId;\n");
-            kernelString += string("    lMemLoad = sMem + jj * ") + num2str(N + numWorkItemsPerXForm) + string(";\n"); 			
-		}
-
-		
-        for( i = 0; i < R0; i++ )
-            kernelString += string("    lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); 
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-        
-        for( i = 0; i < R0; i++ )
-            kernelString += string("    a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-        
-        for( i = 0; i < R0; i++ )
-            kernelString += string("    lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); 
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-        
-        for( i = 0; i < R0; i++ )
-            kernelString += string("    a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-		
-		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
-    }
-	
-	return lMemSize;
-}
-
-static int
-insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
-{
-	int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
-	int i, j, k, ind;
-	int lMemSize = 0;
-	int numIter = maxRadix / Nr;
-	string indent = string("");
-	
-    if( numWorkItemsPerXForm >= mem_coalesce_width )
-    {   
-		if(numXFormsPerWG > 1)
-		{
-            kernelString += string("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
-			indent = string("    ");
-		}	
-		for(i = 0; i < maxRadix; i++) 
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-			formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat);
-		}
-		if(numXFormsPerWG > 1)
-		    kernelString += string("    }\n");
-    }
-    else if( N >= mem_coalesce_width )
-    {
-        int numInnerIter = N / mem_coalesce_width;
-        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
-		
-        kernelString += string("    lMemLoad  = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");  
-        kernelString += string("    ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
-        kernelString += string("    jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
-        kernelString += string("    lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
-		
-        for( i = 0; i < maxRadix; i++ )
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");            
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");         
-		
-        for( i = 0; i < numOuterIter; i++ )
-			for( j = 0; j < numInnerIter; j++ )
-				kernelString += string("    a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-		
-        for( i = 0; i < maxRadix; i++ )
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");            
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");         
-		
-        for( i = 0; i < numOuterIter; i++ )
-			for( j = 0; j < numInnerIter; j++ )
-				kernelString += string("    a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-		
-		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
-		for(i = 0; i < numOuterIter; i++ )
-        {
-            kernelString += string("    if( jj < s ) {\n");
-			for(j = 0; j < numInnerIter; j++ ) 
-				formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); 
-			kernelString += string("    }\n"); 
-			if(i != numOuterIter - 1)
-			    kernelString += string("    jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");			 
-        }
-		kernelString += string("}\n");
-		kernelString += string("else {\n");
-		for(i = 0; i < numOuterIter; i++ )
-        {
-			for(j = 0; j < numInnerIter; j++ ) 
-				formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); 
-        }		
-		kernelString += string("}\n");
-		
-		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
-	}   	
-    else
-    {   
-        kernelString += string("    lMemLoad  = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");  
-        
-		kernelString += string("    ii = lId & ") + num2str(N - 1) + string(";\n");
-        kernelString += string("    jj = lId >> ") + num2str((int) log2(N)) + string(";\n");
-        kernelString += string("    lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
-        
-        for( i = 0; i < maxRadix; i++ )
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-        
-        for( i = 0; i < maxRadix; i++ )
-            kernelString += string("    a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); 
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-        
-        for( i = 0; i < maxRadix; i++ )
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-        
-        for( i = 0; i < maxRadix; i++ )
-            kernelString += string("    a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); 
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-        
-		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
-		for( i = 0; i < maxRadix; i++ )
-        {
-            kernelString += string("    if(jj < s ) {\n");
-			formattedStore(kernelString, i, i*groupSize, dataFormat);
-			kernelString += string("    }\n");
-			if( i != maxRadix - 1)
-				kernelString += string("    jj +=") + num2str(groupSize / N) + string(";\n");
-        } 
-		kernelString += string("}\n");
-		kernelString += string("else {\n");
-		for( i = 0; i < maxRadix; i++ )
-        {
-			formattedStore(kernelString, i, i*groupSize, dataFormat);
-        } 		
-		kernelString += string("}\n");
-		
-		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
-    }
-	
-	return lMemSize;
-}
-
-static void 
-insertfftKernel(string &kernelString, int Nr, int numIter)
-{
-	int i;
-	for(i = 0; i < numIter; i++) 
-	{
-		kernelString += string("    fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n");
-	}
-}
-
-static void
-insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
-{
-	int z, k;
-	int logNPrev = log2(Nprev);
-	
-	for(z = 0; z < numIter; z++) 
-	{
-		if(z == 0)
-		{
-			if(Nprev > 1)
-			    kernelString += string("    angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
-			else
-				kernelString += string("    angf = (float) ii;\n");
-		}	
-		else
-		{
-			if(Nprev > 1)
-			    kernelString += string("    angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); 
-			else
-				kernelString += string("    angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n");
-		}	
-	
-		for(k = 1; k < Nr; k++) {
-			int ind = z*Nr + k;
-			//float fac =  (float) (2.0 * M_PI * (double) k / (double) len);
-			kernelString += string("    ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n");
-			kernelString += string("    w = (float2)(native_cos(ang), native_sin(ang));\n");
-			kernelString += string("    a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n");
-		}
-	}
-}
-
-static int
-getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
-{
-	if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
-		*offset = 0;
-	else {
-		int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
-		int numColsReq = 1;
-		if(numRowsReq > Nr)
-			numColsReq = numRowsReq / Nr;
-		numColsReq = Nprev * numColsReq;
-		*offset = numColsReq;
-	}
-	
-	if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
-		*midPad = 0;
-	else {
-		int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1);
-		if( bankNum >= numWorkItemsPerXForm )
-			*midPad = 0;
-		else
-			*midPad = numWorkItemsPerXForm - bankNum;
-	}
-	
-	int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
-	return lMemSize;
-}
-
-
-static void 
-insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
-{
-	int z, k;
-
-	for(z = 0; z < numIter; z++) {
-		for(k = 0; k < Nr; k++) {
-			int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm;
-			kernelString += string("    lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n");
-		}
-	}
-	kernelString += string("    barrier(CLK_LOCAL_MEM_FENCE);\n");
-}
-
-static void 
-insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
-{
-	int numWorkItemsReqN = n / Nrn;										
-	int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 );			
-	int interBlockHStride = numWorkItemsPerXForm;							
-	int vertWidth = max(numWorkItemsPerXForm / Nprev, 1);					
-	vertWidth = min( vertWidth, Nr);									
-	int vertNum = Nr / vertWidth;										
-	int vertStride = ( n / Nr + offset ) * vertWidth;					
-	int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1);
-	int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1;
-	intraBlockHStride *= Nprev;
-	
-	int stride = numWorkItemsReq / Nrn;									
-	int i;
-	for(i = 0; i < iter; i++) {
-		int ii = i / (interBlockHNum * vertNum);
-		int zz = i % (interBlockHNum * vertNum);
-		int jj = zz % interBlockHNum;
-		int kk = zz / interBlockHNum;
-		int z;
-		for(z = 0; z < Nrn; z++) {
-			int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride;
-			kernelString += string("    a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n");
-		}
-	}
-	kernelString += string("    barrier(CLK_LOCAL_MEM_FENCE);\n");
-}
-
-static void
-insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
-{	
-	int Ncurr = Nprev * Nr;
-	int logNcurr = log2(Ncurr);
-	int logNprev = log2(Nprev);
-	int incr = (numWorkItemsReq + offset) * Nr + midPad;
-	
-	if(Ncurr < numWorkItemsPerXForm) 
-	{
-		if(Nprev == 1)
-		    kernelString += string("    j = ii & ") + num2str(Ncurr - 1) + string(";\n");
-		else
-			kernelString += string("    j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
-		
-		if(Nprev == 1) 
-			kernelString += string("    i = ii >> ") + num2str(logNcurr) + string(";\n");
-		else 
-			kernelString += string("    i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); 
-	}	
-	else 
-	{
-		if(Nprev == 1)
-		    kernelString += string("    j = ii;\n");
-		else
-			kernelString += string("    j = ii >> ") + num2str(logNprev) + string(";\n");
-		if(Nprev == 1) 
-			kernelString += string("    i = 0;\n"); 
-		else 
-			kernelString += string("    i = ii & ") + num2str(Nprev - 1) + string(";\n");
-	}
-
-    if(numXFormsPerWG > 1)
-        kernelString += string("    i = mad24(jj, ") + num2str(incr) + string(", i);\n");		
-
-    kernelString += string("    lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); 
-}
-
-static void
-insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
-{
-	if(numXFormsPerWG == 1) {
-		kernelString += string("    lMemStore = sMem + ii;\n");		
-	}
-	else {
-		kernelString += string("    lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n");	
-	}
-}
-
-
-static void
-createLocalMemfftKernelString(cl_fft_plan *plan)
-{
-	unsigned int radixArray[10];
-	unsigned int numRadix;
-	 
-	unsigned int n = plan->n.x;
-	
-	assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n");
-	
-	getRadixArray(n, radixArray, &numRadix, 0);
-	assert(numRadix > 0 && "no radix array supplied\n");
-	
-	if(n/radixArray[0] > plan->max_work_item_per_workgroup)
-	    getRadixArray(n, radixArray, &numRadix, plan->max_radix);
-
-	assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
-	assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
-	
-	unsigned int tmpLen = 1;
-	unsigned int i;
-	for(i = 0; i < numRadix; i++)
-	{	
-		assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) );
-	    tmpLen *= radixArray[i];
-	}
-	assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n");
-	
-	int offset, midPad;
-	string localString(""), kernelName("");
-	
-	clFFT_DataFormat dataFormat = plan->format;
-	string *kernelString = plan->kernel_string;
-	
-	
-	cl_fft_kernel_info **kInfo = &plan->kernel_info;
-	int kCount = 0;
-	
-	while(*kInfo)
-	{
-		kInfo = &(*kInfo)->next;
-		kCount++;
-	}
-	
-	kernelName = string("fft") + num2str(kCount);
-	
-	*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
-	(*kInfo)->kernel = 0;
-	(*kInfo)->lmem_size = 0;
-	(*kInfo)->num_workgroups = 0;
-	(*kInfo)->num_workitems_per_workgroup = 0;
-	(*kInfo)->dir = cl_fft_kernel_x;
-	(*kInfo)->in_place_possible = 1;
-	(*kInfo)->next = NULL;
-	(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
-	strcpy((*kInfo)->kernel_name, kernelName.c_str());
-	
-	unsigned int numWorkItemsPerXForm = n / radixArray[0];
-	unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; 
-	assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup);
-	int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm;
-	(*kInfo)->num_workgroups = 1;
-    (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG;
-	(*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG;
-	
-	unsigned int *N = radixArray;
-	unsigned int maxRadix = N[0];
-	unsigned int lMemSize = 0;
-		
-	insertVariables(localString, maxRadix);
-	
-	lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat);
-	(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
-	
-	string xcomp = string("x");
-	string ycomp = string("y");
-	
-	unsigned int Nprev = 1;
-	unsigned int len = n;
-	unsigned int r;
-	for(r = 0; r < numRadix; r++) 
-	{
-		int numIter = N[0] / N[r];
-		int numWorkItemsReq = n / N[r];
-		int Ncurr = Nprev * N[r];
-		insertfftKernel(localString, N[r], numIter);
-		
-		if(r < (numRadix - 1)) {
-			insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm);
-			lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad);
-			(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
-			insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad);
-			insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad);
-			insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
-			insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
-			insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
-			insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
-			Nprev = Ncurr;
-			len = len / N[r];
-		}
-	}
-	
-	lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat);
-	(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
-	
-	insertHeader(*kernelString, kernelName, dataFormat);
-	*kernelString += string("{\n");
-	if((*kInfo)->lmem_size)
-        *kernelString += string("    __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
-	*kernelString += localString;
-	*kernelString += string("}\n");
-}
-
-// For n larger than what can be computed using local memory fft, global transposes
-// multiple kernel launces is needed. For these sizes, n can be decomposed using
-// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
-// launches will be needed, first computing 64 x 32, length 128 ffts, second computing
-// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. 
-// Each of these base radices can futher be divided into factors so that each of these 
-// base ffts can be computed within one kernel launch using in-register ffts and local 
-// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length 
-// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length 
-// 16 ffts followed by transpose using local memory followed by each of these eight 
-// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This 
-// means only 8 work items are needed for computing one length 128 fft. If we choose
-// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one
-// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this 
-// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each 
-// work group where each work group is computing 8 length 128 ffts where each length
-// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels
-// in this example. Users can play with difference base radices and difference 
-// decompositions of base radices to generates different kernels and see which gives
-// best performance. Following function is just fixed to use 128 as base radix
-
-void
-getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
-{
-	int baseRadix = min(n, 128);
-	
-	int numR = 0;
-	int N = n;
-	while(N > baseRadix) 
-	{
-		N /= baseRadix;
-		numR++;
-	}
-	
-	for(int i = 0; i < numR; i++)
-		radix[i] = baseRadix;
-	
-	radix[numR] = N;
-	numR++;
-	*numRadices = numR;
-		
-	for(int i = 0; i < numR; i++)
-	{
-		int B = radix[i];
-		if(B <= 8)
-		{
-			R1[i] = B;
-			R2[i] = 1;
-			continue;
-		}
-		
-		int r1 = 2; 
-		int r2 = B / r1;
-	    while(r2 > r1)
-	    {
-		   r1 *=2;
-		   r2 = B / r1;
-	    }
-		R1[i] = r1;
-		R2[i] = r2;
-	}	
-}
-
-static void
-createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
-{		
-	int i, j, k, t;
-	int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-    int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-    int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-	int radix, R1, R2;
-	int numRadices;
-	
-	int maxThreadsPerBlock = plan->max_work_item_per_workgroup;
-	int maxArrayLen = plan->max_radix;
-	int batchSize = plan->min_mem_coalesce_width;	
-	clFFT_DataFormat dataFormat = plan->format;
-	int vertical = (dir == cl_fft_kernel_x) ? 0 : 1;	
-	
-	getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices);
-		
-	int numPasses = numRadices;
-	
-	string localString(""), kernelName("");
-	string *kernelString = plan->kernel_string;
-	cl_fft_kernel_info **kInfo = &plan->kernel_info; 
-	int kCount = 0;
-	
-	while(*kInfo)
-	{
-		kInfo = &(*kInfo)->next;
-		kCount++;
-	}
-	
-	int N = n;
-	int m = (int)log2(n);
-	int Rinit = vertical ? BS : 1;
-	batchSize = vertical ? min(BS, batchSize) : batchSize;
-	int passNum;
-	
-	for(passNum = 0; passNum < numPasses; passNum++) 
-	{
-		
-		localString.clear();
-		kernelName.clear();
-		
-		radix = radixArr[passNum];
-		R1 = R1Arr[passNum];
-		R2 = R2Arr[passNum];
-				
-		int strideI = Rinit;
-		for(i = 0; i < numPasses; i++)
-			if(i != passNum)
-				strideI *= radixArr[i];
-		
-		int strideO = Rinit;
-		for(i = 0; i < passNum; i++)
-			strideO *= radixArr[i];
-		
-		int threadsPerXForm = R2;
-		batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
-		batchSize = min(batchSize, strideI);
-		int threadsPerBlock = batchSize * threadsPerXForm;
-		threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock);
-		batchSize = threadsPerBlock / threadsPerXForm;
-		assert(R2 <= R1);
-		assert(R1*R2 == radix);
-		assert(R1 <= maxArrayLen);
-		assert(threadsPerBlock <= maxThreadsPerBlock);
-		
-		int numIter = R1 / R2;
-		int gInInc = threadsPerBlock / batchSize;
-		
-		
-		int lgStrideO = log2(strideO);
-		int numBlocksPerXForm = strideI / batchSize;
-		int numBlocks = numBlocksPerXForm;
-		if(!vertical)
-			numBlocks *= BS;
-		else
-			numBlocks *= vertBS;
-		
-		kernelName = string("fft") + num2str(kCount);
-		*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
-		(*kInfo)->kernel = 0;
-		if(R2 == 1)
-			(*kInfo)->lmem_size = 0;
-		else
-		{
-		    if(strideO == 1)
-		        (*kInfo)->lmem_size = (radix + 1)*batchSize;
-		    else
-			    (*kInfo)->lmem_size = threadsPerBlock*R1;
-		}
-		(*kInfo)->num_workgroups = numBlocks;
-        (*kInfo)->num_xforms_per_workgroup = 1;
-		(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
-		(*kInfo)->dir = dir;
-		if( (passNum == (numPasses - 1)) && (numPasses & 1) )
-		    (*kInfo)->in_place_possible = 1;
-		else
-			(*kInfo)->in_place_possible = 0;
-		(*kInfo)->next = NULL;
-		(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
-		strcpy((*kInfo)->kernel_name, kernelName.c_str());
-		
-		insertVariables(localString, R1);
-						
-		if(vertical) 
-		{
-			localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
-			localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
-			localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n");
-			localString += string("tid = groupId * ") + num2str(batchSize) + string(";\n");
-			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
-			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
-			int stride = radix*Rinit;
-			for(i = 0; i < passNum; i++)
-				stride *= radixArr[i];
-			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n");
-			localString += string("bNum = groupId;\n");
-		}
-		else 
-		{
-			int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
-			localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
-			localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
-			localString += string("indexIn = bNum * ") + num2str(batchSize) + string(";\n");
-			localString += string("tid = indexIn;\n");
-			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
-			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); 
-			int stride = radix*Rinit;
-			for(i = 0; i < passNum; i++)
-				stride *= radixArr[i];
-			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");			
-			localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
-			localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");	
-		}
-		
-		// Load Data
-		int lgBatchSize = log2(batchSize);
-		localString += string("tid = lId;\n");
-		localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
-		localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); 
-		localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n");
-
-		if(dataFormat == clFFT_SplitComplexFormat) 
-		{
-			localString += string("in_real += indexIn;\n");
-			localString += string("in_imag += indexIn;\n");			
-			for(j = 0; j < R1; j++)
-				localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n");
-			for(j = 0; j < R1; j++) 
-				localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n");
-		}
-		else 
-		{
-			localString += string("in += indexIn;\n");
-			for(j = 0; j < R1; j++)
-				localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
-	    }
-		
-		localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");							  
-		
-		if(R2 > 1)
-		{
-		    // twiddle
-		    for(k = 1; k < R1; k++) 
-		    {
-			    localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
-			    localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
-			    localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); 
-		    }
-		
-		    // shuffle
-		    numIter = R1 / R2;	
-		    localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n");
-		    localString += string("lMemStore = sMem + tid;\n");
-		    localString += string("lMemLoad = sMem + indexIn;\n");
-		    for(k = 0; k < R1; k++) 
-			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
-		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");	
-		    for(k = 0; k < numIter; k++)
-			    for(t = 0; t < R2; t++)
-				    localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
-		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-		    for(k = 0; k < R1; k++) 
-			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
-		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");	
-		    for(k = 0; k < numIter; k++)
-			    for(t = 0; t < R2; t++)
-				    localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
-		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-		
-		    for(j = 0; j < numIter; j++)
-			    localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n");
-		}
-		
-		// twiddle
-		if(passNum < (numPasses - 1)) 
-		{
-			localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n");
-			localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); 
-			localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n");
-			for(t = 0; t < R1; t++) 
-			{
-				localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
-				localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
-				localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n");
-			}
-		}
-		
-		// Store Data
-		if(strideO == 1) 
-		{
-			
-			localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
-			localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
-			
-			for(int i = 0; i < R1/R2; i++)
-				for(int j = 0; j < R2; j++)
-					localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n");
-			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-			if(threadsPerBlock >= radix)
-            {
-                for(int i = 0; i < R1; i++)
-                localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
-            }
-            else
-            {
-                int innerIter = radix/threadsPerBlock;
-                int outerIter = R1/innerIter;
-                for(int i = 0; i < outerIter; i++)
-                    for(int j = 0; j < innerIter; j++)
-                        localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
-            }
-			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-			
-			for(int i = 0; i < R1/R2; i++)
-				for(int j = 0; j < R2; j++)
-					localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n");
-			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-			if(threadsPerBlock >= radix)
-            {
-                for(int i = 0; i < R1; i++)
-                    localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
-            }
-            else
-            {
-                int innerIter = radix/threadsPerBlock;
-                int outerIter = R1/innerIter;
-                for(int i = 0; i < outerIter; i++)
-                    for(int j = 0; j < innerIter; j++)
-                        localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
-            }
-			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-			
-			localString += string("indexOut += tid;\n");
-			if(dataFormat == clFFT_SplitComplexFormat) {
-				localString += string("out_real += indexOut;\n");
-				localString += string("out_imag += indexOut;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
-			}
-			else {
-				localString += string("out += indexOut;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");				
-			}
-		 
-		}
-		else 
-		{
-			localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n");
-			if(dataFormat == clFFT_SplitComplexFormat) {
-				localString += string("out_real += indexOut;\n");
-				localString += string("out_imag += indexOut;\n");			
-				for(k = 0; k < R1; k++)
-					localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n");
-			}
-			else {
-				localString += string("out += indexOut;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n");
-			}
-		}
-		
-		insertHeader(*kernelString, kernelName, dataFormat);
-		*kernelString += string("{\n");
-		if((*kInfo)->lmem_size)
-			*kernelString += string("    __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
-		*kernelString += localString;
-		*kernelString += string("}\n");		
-		
-		N /= radix;
-		kInfo = &(*kInfo)->next;
-		kCount++;
-	}
-}
-
-void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
-{	
-    unsigned int radixArray[10];
-    unsigned int numRadix;
-    
-	switch(dir)
-	{
-		case cl_fft_kernel_x:
-		    if(plan->n.x > plan->max_localmem_fft_size)
-		    {
-		        createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
-		    }
-		    else if(plan->n.x > 1)
-		    {
-		        getRadixArray(plan->n.x, radixArray, &numRadix, 0);
-		        if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
-		        {
-				    createLocalMemfftKernelString(plan);
-				}
-			    else
-			    {
-			        getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
-			        if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
-			            createLocalMemfftKernelString(plan);
-			        else
-				        createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
-				}
-		    }
-			break;
-			
-		case cl_fft_kernel_y:
-			if(plan->n.y > 1)
-			    createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
-			break;
-			
-		case cl_fft_kernel_z:
-			if(plan->n.z > 1)
-			    createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1);
-		default:
-			return;
-	}
-}
-
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/fft_kernelstring.cpp.orig b/RTCP/Cobalt/OpenCL_FFT/src/fft_kernelstring.cpp.orig
deleted file mode 100644
index bbb9298d921528032397da8a71cdafa5502a421e..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/fft_kernelstring.cpp.orig
+++ /dev/null
@@ -1,1256 +0,0 @@
-
-//
-// File:       fft_kernelstring.cpp
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-#include <iostream>
-#include <sstream>
-#include <cstring>
-#include <cassert>
-#include "fft_internal.h"
-#include "clFFT.h"
-
-using namespace std;
-
-#define max(A,B) ((A) > (B) ? (A) : (B))
-#define min(A,B) ((A) < (B) ? (A) : (B))
-
-static string 
-num2str(int num)
-{
-	char temp[200];
-	sprintf(temp, "%d", num);
-	return string(temp);
-}
-
-// For any n, this function decomposes n into factors for loacal memory tranpose 
-// based fft. Factors (radices) are sorted such that the first one (radixArray[0])
-// is the largest. This base radix determines the number of registers used by each
-// work item and product of remaining radices determine the size of work group needed.
-// To make things concrete with and example, suppose n = 1024. It is decomposed into
-// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and 
-// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length
-// 16 ffts (64 work items working in parallel) following by transpose using local 
-// memory followed by again 64 length 16 ffts followed by transpose using local memory
-// followed by 256 length 4 ffts. For the last step since with size of work group is 
-// 64 and each work item can array for 16 values, 64 work items can compute 256 length
-// 4 ffts by each work item computing 4 length 4 ffts. 
-// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
-// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed
-// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
-// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
-// by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
-// for the last step, each work item computes two length 4 in-register ffts and thus
-// 256 work items are needed to compute all 512 ffts. 
-// For n = 32 = 8 x 4, 4 work items first compute 4 in-register 
-// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register
-// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
-// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, 
-// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). 
-// Users can play with these parameters to figure what gives best performance on
-// their particular device i.e. some device have less register space thus using
-// smaller base radix can avoid spilling ... some has small local memory thus 
-// using smaller work group size may be required etc
-
-static void 
-getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix)
-{
-    if(maxRadix > 1)
-    {
-        maxRadix = min(n, maxRadix);
-        unsigned int cnt = 0;
-        while(n > maxRadix)
-        {
-            radixArray[cnt++] = maxRadix;
-            n /= maxRadix;
-        }
-        radixArray[cnt++] = n;
-        *numRadices = cnt;
-        return;
-    }
-
-	switch(n) 
-	{
-		case 2:
-			*numRadices = 1;
-			radixArray[0] = 2;
-			break;
-			
-		case 4:
-			*numRadices = 1;
-			radixArray[0] = 4;
-			break;
-			
-		case 8:
-			*numRadices = 1;
-			radixArray[0] = 8;
-			break;
-			
-		case 16:
-			*numRadices = 2;
-			radixArray[0] = 8; radixArray[1] = 2; 
-			break;
-			
-		case 32:
-			*numRadices = 2;
-			radixArray[0] = 8; radixArray[1] = 4;
-			break;
-			
-		case 64:
-			*numRadices = 2;
-			radixArray[0] = 8; radixArray[1] = 8;
-			break;
-			
-		case 128:
-			*numRadices = 3;
-			radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4;
-			break;
-			
-		case 256:
-			*numRadices = 4;
-			radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4;
-			break;
-			
-		case 512:
-			*numRadices = 3;
-			radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8;
-			break;			
-			
-		case 1024:
-			*numRadices = 3;
-			radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4;
-			break;	
-		case 2048:
-			*numRadices = 4;
-			radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4;
-			break;
-		default:
-			*numRadices = 0;
-			return;
-	}
-}
-
-static void
-insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
-{
-	if(dataFormat == clFFT_SplitComplexFormat) 
-		kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
-	else 
-		kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
-}
-
-static void 
-insertVariables(string &kStream, int maxRadix)
-{
-	kStream += string("    int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n");
-    kStream += string("    int s, ii, jj, offset;\n");
-	kStream += string("    float2 w;\n");
-	kStream += string("    float ang, angf, ang1;\n");
-    kStream += string("    __local float *lMemStore, *lMemLoad;\n");
-    kStream += string("    float2 a[") +  num2str(maxRadix) + string("];\n");
-    kStream += string("    int lId = get_local_id( 0 );\n");
-    kStream += string("    int groupId = get_group_id( 0 );\n");
-}
-
-static void
-formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
-{
-	if(dataFormat == clFFT_InterleavedComplexFormat)
-		kernelString += string("        a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
-	else
-	{
-		kernelString += string("        a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
-		kernelString += string("        a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n");
-	}
-}
-
-static void
-formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
-{
-	if(dataFormat == clFFT_InterleavedComplexFormat)
-		kernelString += string("        out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
-	else
-	{
-		kernelString += string("        out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
-		kernelString += string("        out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n");
-	}
-}
-
-static int
-insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
-{
-	int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm);
-	int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
-	int i, j;
-	int lMemSize = 0;
-	
-	if(numXFormsPerWG > 1)
-	    kernelString += string("        s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
-	
-    if(numWorkItemsPerXForm >= mem_coalesce_width)
-    {   		
-		if(numXFormsPerWG > 1)
-		{
-            kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n");
-            kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
-            kernelString += string("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
-			kernelString += string("        offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n");
-			if(dataFormat == clFFT_InterleavedComplexFormat)
-			{
-			    kernelString += string("        in += offset;\n");
-			    kernelString += string("        out += offset;\n");
-			}
-			else
-			{
-			    kernelString += string("        in_real += offset;\n");
-				kernelString += string("        in_imag += offset;\n");
-			    kernelString += string("        out_real += offset;\n");
-				kernelString += string("        out_imag += offset;\n");
-			}
-			for(i = 0; i < R0; i++)
-				formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
-			kernelString += string("    }\n");
-		}
-		else
-		{
-			kernelString += string("    ii = lId;\n");
-			kernelString += string("    jj = 0;\n");
-			kernelString += string("    offset =  mad24(groupId, ") + num2str(N) + string(", ii);\n");
-			if(dataFormat == clFFT_InterleavedComplexFormat)
-			{
-			    kernelString += string("        in += offset;\n");
-			    kernelString += string("        out += offset;\n");
-			}
-			else
-			{
-			    kernelString += string("        in_real += offset;\n");
-				kernelString += string("        in_imag += offset;\n");
-			    kernelString += string("        out_real += offset;\n");
-				kernelString += string("        out_imag += offset;\n");
-			}
-			for(i = 0; i < R0; i++)
-				formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
-		}
-    }
-    else if( N >= mem_coalesce_width )
-    {
-        int numInnerIter = N / mem_coalesce_width;
-        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
-		
-        kernelString += string("    ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
-        kernelString += string("    jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
-        kernelString += string("    lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
-        kernelString += string("    offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n");
-        kernelString += string("    offset = mad24( offset, ") + num2str(N) + string(", ii );\n");
-		if(dataFormat == clFFT_InterleavedComplexFormat)
-		{
-			kernelString += string("        in += offset;\n");
-			kernelString += string("        out += offset;\n");
-		}
-		else
-		{
-			kernelString += string("        in_real += offset;\n");
-			kernelString += string("        in_imag += offset;\n");
-			kernelString += string("        out_real += offset;\n");
-			kernelString += string("        out_imag += offset;\n");
-		}
-        
-		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
-        for(i = 0; i < numOuterIter; i++ )
-        {
-            kernelString += string("    if( jj < s ) {\n");
-			for(j = 0; j < numInnerIter; j++ ) 
-				formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
-			kernelString += string("    }\n"); 
-			if(i != numOuterIter - 1)
-			    kernelString += string("    jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");			 
-        }
-		kernelString += string("}\n ");
-		kernelString += string("else {\n");
-        for(i = 0; i < numOuterIter; i++ )
-        {
-			for(j = 0; j < numInnerIter; j++ ) 
-				formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);			
-        }		
-		kernelString += string("}\n");
-        
-		kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
-		kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
-        kernelString += string("    lMemLoad  = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n");  
-		
-        for( i = 0; i < numOuterIter; i++ )
-		{
-			for( j = 0; j < numInnerIter; j++ )
-			{	
-				kernelString += string("    lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + 
-				                num2str(i * numInnerIter + j) + string("].x;\n");
-			}
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-        
-        for( i = 0; i < R0; i++ )
-			kernelString += string("    a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");            
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");  
-
-	    for( i = 0; i < numOuterIter; i++ )
-		{
-			for( j = 0; j < numInnerIter; j++ )
-			{	
-				kernelString += string("    lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + 
-								num2str(i * numInnerIter + j) + string("].y;\n");
-			}
-	    }	
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-																						   
-		for( i = 0; i < R0; i++ )
-			kernelString += string("    a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");            
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");  
-		
-		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
-    }  
-    else
-    {
-        kernelString += string("    offset = mad24( groupId,  ") + num2str(N * numXFormsPerWG) + string(", lId );\n");
-		if(dataFormat == clFFT_InterleavedComplexFormat)
-		{
-			kernelString += string("        in += offset;\n");
-			kernelString += string("        out += offset;\n");
-		}
-		else
-		{
-			kernelString += string("        in_real += offset;\n");
-			kernelString += string("        in_imag += offset;\n");
-			kernelString += string("        out_real += offset;\n");
-			kernelString += string("        out_imag += offset;\n");
-		}
-        
-        kernelString += string("    ii = lId & ") + num2str(N-1) + string(";\n");
-        kernelString += string("    jj = lId >> ") + num2str((int)log2(N)) + string(";\n");
-        kernelString += string("    lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
-        
-		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
-        for( i = 0; i < R0; i++ )
-        {
-            kernelString += string("    if(jj < s )\n");
-			formattedLoad(kernelString, i, i*groupSize, dataFormat);
-			if(i != R0 - 1)
-			    kernelString += string("    jj += ") + num2str(groupSize / N) + string(";\n");
-        }
-		kernelString += string("}\n");
-		kernelString += string("else {\n");
-        for( i = 0; i < R0; i++ )
-        {
-			formattedLoad(kernelString, i, i*groupSize, dataFormat);
-        }		
-		kernelString += string("}\n");
-        
-		if(numWorkItemsPerXForm > 1)
-		{
-            kernelString += string("    ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
-            kernelString += string("    jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
-            kernelString += string("    lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); 
-		}
-		else 
-		{
-            kernelString += string("    ii = 0;\n");
-            kernelString += string("    jj = lId;\n");
-            kernelString += string("    lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); 			
-		}
-
-		
-        for( i = 0; i < R0; i++ )
-            kernelString += string("    lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); 
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-        
-        for( i = 0; i < R0; i++ )
-            kernelString += string("    a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-        
-        for( i = 0; i < R0; i++ )
-            kernelString += string("    lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); 
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-        
-        for( i = 0; i < R0; i++ )
-            kernelString += string("    a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
-		kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-		
-		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
-    }
-	
-	return lMemSize;
-}
-
-static int
-insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
-{
-	int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
-	int i, j, k, ind;
-	int lMemSize = 0;
-	int numIter = maxRadix / Nr;
-	string indent = string("");
-	
-    if( numWorkItemsPerXForm >= mem_coalesce_width )
-    {   
-		if(numXFormsPerWG > 1)
-		{
-            kernelString += string("    if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
-			indent = string("    ");
-		}	
-		for(i = 0; i < maxRadix; i++) 
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-			formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat);
-		}
-		if(numXFormsPerWG > 1)
-		    kernelString += string("    }\n");
-    }
-    else if( N >= mem_coalesce_width )
-    {
-        int numInnerIter = N / mem_coalesce_width;
-        int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
-		
-        kernelString += string("    lMemLoad  = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");  
-        kernelString += string("    ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
-        kernelString += string("    jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
-        kernelString += string("    lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
-		
-        for( i = 0; i < maxRadix; i++ )
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");            
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");         
-		
-        for( i = 0; i < numOuterIter; i++ )
-			for( j = 0; j < numInnerIter; j++ )
-				kernelString += string("    a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-		
-        for( i = 0; i < maxRadix; i++ )
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");            
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");         
-		
-        for( i = 0; i < numOuterIter; i++ )
-			for( j = 0; j < numInnerIter; j++ )
-				kernelString += string("    a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-		
-		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
-		for(i = 0; i < numOuterIter; i++ )
-        {
-            kernelString += string("    if( jj < s ) {\n");
-			for(j = 0; j < numInnerIter; j++ ) 
-				formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); 
-			kernelString += string("    }\n"); 
-			if(i != numOuterIter - 1)
-			    kernelString += string("    jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");			 
-        }
-		kernelString += string("}\n");
-		kernelString += string("else {\n");
-		for(i = 0; i < numOuterIter; i++ )
-        {
-			for(j = 0; j < numInnerIter; j++ ) 
-				formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); 
-        }		
-		kernelString += string("}\n");
-		
-		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
-	}   	
-    else
-    {   
-        kernelString += string("    lMemLoad  = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");  
-        
-		kernelString += string("    ii = lId & ") + num2str(N - 1) + string(";\n");
-        kernelString += string("    jj = lId >> ") + num2str((int) log2(N)) + string(";\n");
-        kernelString += string("    lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
-        
-        for( i = 0; i < maxRadix; i++ )
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-        
-        for( i = 0; i < maxRadix; i++ )
-            kernelString += string("    a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); 
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-        
-        for( i = 0; i < maxRadix; i++ )
-		{
-			j = i % numIter;
-			k = i / numIter;
-			ind = j * Nr + k;
-            kernelString += string("    lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
-		}	
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n");
-        
-        for( i = 0; i < maxRadix; i++ )
-            kernelString += string("    a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); 
-        kernelString += string("    barrier( CLK_LOCAL_MEM_FENCE );\n"); 
-        
-		kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
-		for( i = 0; i < maxRadix; i++ )
-        {
-            kernelString += string("    if(jj < s ) {\n");
-			formattedStore(kernelString, i, i*groupSize, dataFormat);
-			kernelString += string("    }\n");
-			if( i != maxRadix - 1)
-				kernelString += string("    jj +=") + num2str(groupSize / N) + string(";\n");
-        } 
-		kernelString += string("}\n");
-		kernelString += string("else {\n");
-		for( i = 0; i < maxRadix; i++ )
-        {
-			formattedStore(kernelString, i, i*groupSize, dataFormat);
-        } 		
-		kernelString += string("}\n");
-		
-		lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
-    }
-	
-	return lMemSize;
-}
-
-static void 
-insertfftKernel(string &kernelString, int Nr, int numIter)
-{
-	int i;
-	for(i = 0; i < numIter; i++) 
-	{
-		kernelString += string("    fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n");
-	}
-}
-
-static void
-insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
-{
-	int z, k;
-	int logNPrev = log2(Nprev);
-	
-	for(z = 0; z < numIter; z++) 
-	{
-		if(z == 0)
-		{
-			if(Nprev > 1)
-			    kernelString += string("    angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
-			else
-				kernelString += string("    angf = (float) ii;\n");
-		}	
-		else
-		{
-			if(Nprev > 1)
-			    kernelString += string("    angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); 
-			else
-				kernelString += string("    angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n");
-		}	
-	
-		for(k = 1; k < Nr; k++) {
-			int ind = z*Nr + k;
-			//float fac =  (float) (2.0 * M_PI * (double) k / (double) len);
-			kernelString += string("    ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n");
-			kernelString += string("    w = (float2)(native_cos(ang), native_sin(ang));\n");
-			kernelString += string("    a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n");
-		}
-	}
-}
-
-static int
-getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
-{
-	if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
-		*offset = 0;
-	else {
-		int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
-		int numColsReq = 1;
-		if(numRowsReq > Nr)
-			numColsReq = numRowsReq / Nr;
-		numColsReq = Nprev * numColsReq;
-		*offset = numColsReq;
-	}
-	
-	if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
-		*midPad = 0;
-	else {
-		int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1);
-		if( bankNum >= numWorkItemsPerXForm )
-			*midPad = 0;
-		else
-			*midPad = numWorkItemsPerXForm - bankNum;
-	}
-	
-	int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
-	return lMemSize;
-}
-
-
-static void 
-insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
-{
-	int z, k;
-
-	for(z = 0; z < numIter; z++) {
-		for(k = 0; k < Nr; k++) {
-			int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm;
-			kernelString += string("    lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n");
-		}
-	}
-	kernelString += string("    barrier(CLK_LOCAL_MEM_FENCE);\n");
-}
-
-static void 
-insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
-{
-	int numWorkItemsReqN = n / Nrn;										
-	int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 );			
-	int interBlockHStride = numWorkItemsPerXForm;							
-	int vertWidth = max(numWorkItemsPerXForm / Nprev, 1);					
-	vertWidth = min( vertWidth, Nr);									
-	int vertNum = Nr / vertWidth;										
-	int vertStride = ( n / Nr + offset ) * vertWidth;					
-	int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1);
-	int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1;
-	intraBlockHStride *= Nprev;
-	
-	int stride = numWorkItemsReq / Nrn;									
-	int i;
-	for(i = 0; i < iter; i++) {
-		int ii = i / (interBlockHNum * vertNum);
-		int zz = i % (interBlockHNum * vertNum);
-		int jj = zz % interBlockHNum;
-		int kk = zz / interBlockHNum;
-		int z;
-		for(z = 0; z < Nrn; z++) {
-			int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride;
-			kernelString += string("    a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n");
-		}
-	}
-	kernelString += string("    barrier(CLK_LOCAL_MEM_FENCE);\n");
-}
-
-static void
-insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
-{	
-	int Ncurr = Nprev * Nr;
-	int logNcurr = log2(Ncurr);
-	int logNprev = log2(Nprev);
-	int incr = (numWorkItemsReq + offset) * Nr + midPad;
-	
-	if(Ncurr < numWorkItemsPerXForm) 
-	{
-		if(Nprev == 1)
-		    kernelString += string("    j = ii & ") + num2str(Ncurr - 1) + string(";\n");
-		else
-			kernelString += string("    j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
-		
-		if(Nprev == 1) 
-			kernelString += string("    i = ii >> ") + num2str(logNcurr) + string(";\n");
-		else 
-			kernelString += string("    i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); 
-	}	
-	else 
-	{
-		if(Nprev == 1)
-		    kernelString += string("    j = ii;\n");
-		else
-			kernelString += string("    j = ii >> ") + num2str(logNprev) + string(";\n");
-		if(Nprev == 1) 
-			kernelString += string("    i = 0;\n"); 
-		else 
-			kernelString += string("    i = ii & ") + num2str(Nprev - 1) + string(";\n");
-	}
-
-    if(numXFormsPerWG > 1)
-        kernelString += string("    i = mad24(jj, ") + num2str(incr) + string(", i);\n");		
-
-    kernelString += string("    lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); 
-}
-
-static void
-insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
-{
-	if(numXFormsPerWG == 1) {
-		kernelString += string("    lMemStore = sMem + ii;\n");		
-	}
-	else {
-		kernelString += string("    lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n");	
-	}
-}
-
-
-static void
-createLocalMemfftKernelString(cl_fft_plan *plan)
-{
-	unsigned int radixArray[10];
-	unsigned int numRadix;
-	 
-	unsigned int n = plan->n.x;
-	
-	assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n");
-	
-	getRadixArray(n, radixArray, &numRadix, 0);
-	assert(numRadix > 0 && "no radix array supplied\n");
-	
-	if(n/radixArray[0] > plan->max_work_item_per_workgroup)
-	    getRadixArray(n, radixArray, &numRadix, plan->max_radix);
-
-	assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
-	assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
-	
-	unsigned int tmpLen = 1;
-	unsigned int i;
-	for(i = 0; i < numRadix; i++)
-	{	
-		assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) );
-	    tmpLen *= radixArray[i];
-	}
-	assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n");
-	
-	int offset, midPad;
-	string localString(""), kernelName("");
-	
-	clFFT_DataFormat dataFormat = plan->format;
-	string *kernelString = plan->kernel_string;
-	
-	
-	cl_fft_kernel_info **kInfo = &plan->kernel_info;
-	int kCount = 0;
-	
-	while(*kInfo)
-	{
-		kInfo = &(*kInfo)->next;
-		kCount++;
-	}
-	
-	kernelName = string("fft") + num2str(kCount);
-	
-	*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
-	(*kInfo)->kernel = 0;
-	(*kInfo)->lmem_size = 0;
-	(*kInfo)->num_workgroups = 0;
-	(*kInfo)->num_workitems_per_workgroup = 0;
-	(*kInfo)->dir = cl_fft_kernel_x;
-	(*kInfo)->in_place_possible = 1;
-	(*kInfo)->next = NULL;
-	(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
-	strcpy((*kInfo)->kernel_name, kernelName.c_str());
-	
-	unsigned int numWorkItemsPerXForm = n / radixArray[0];
-	unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; 
-	assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup);
-	int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm;
-	(*kInfo)->num_workgroups = 1;
-    (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG;
-	(*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG;
-	
-	unsigned int *N = radixArray;
-	unsigned int maxRadix = N[0];
-	unsigned int lMemSize = 0;
-		
-	insertVariables(localString, maxRadix);
-	
-	lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat);
-	(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
-	
-	string xcomp = string("x");
-	string ycomp = string("y");
-	
-	unsigned int Nprev = 1;
-	unsigned int len = n;
-	unsigned int r;
-	for(r = 0; r < numRadix; r++) 
-	{
-		int numIter = N[0] / N[r];
-		int numWorkItemsReq = n / N[r];
-		int Ncurr = Nprev * N[r];
-		insertfftKernel(localString, N[r], numIter);
-		
-		if(r < (numRadix - 1)) {
-			insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm);
-			lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad);
-			(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
-			insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad);
-			insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad);
-			insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
-			insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
-			insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
-			insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
-			Nprev = Ncurr;
-			len = len / N[r];
-		}
-	}
-	
-	lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat);
-	(*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
-	
-	insertHeader(*kernelString, kernelName, dataFormat);
-	*kernelString += string("{\n");
-	if((*kInfo)->lmem_size)
-        *kernelString += string("    __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
-	*kernelString += localString;
-	*kernelString += string("}\n");
-}
-
-// For n larger than what can be computed using local memory fft, global transposes
-// multiple kernel launces is needed. For these sizes, n can be decomposed using
-// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
-// launches will be needed, first computing 64 x 32, length 128 ffts, second computing
-// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. 
-// Each of these base radices can futher be divided into factors so that each of these 
-// base ffts can be computed within one kernel launch using in-register ffts and local 
-// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length 
-// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length 
-// 16 ffts followed by transpose using local memory followed by each of these eight 
-// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This 
-// means only 8 work items are needed for computing one length 128 fft. If we choose
-// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one
-// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this 
-// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each 
-// work group where each work group is computing 8 length 128 ffts where each length
-// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels
-// in this example. Users can play with difference base radices and difference 
-// decompositions of base radices to generates different kernels and see which gives
-// best performance. Following function is just fixed to use 128 as base radix
-
-void
-getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
-{
-	int baseRadix = min(n, 128);
-	
-	int numR = 0;
-	int N = n;
-	while(N > baseRadix) 
-	{
-		N /= baseRadix;
-		numR++;
-	}
-	
-	for(int i = 0; i < numR; i++)
-		radix[i] = baseRadix;
-	
-	radix[numR] = N;
-	numR++;
-	*numRadices = numR;
-		
-	for(int i = 0; i < numR; i++)
-	{
-		int B = radix[i];
-		if(B <= 8)
-		{
-			R1[i] = B;
-			R2[i] = 1;
-			continue;
-		}
-		
-		int r1 = 2; 
-		int r2 = B / r1;
-	    while(r2 > r1)
-	    {
-		   r1 *=2;
-		   r2 = B / r1;
-	    }
-		R1[i] = r1;
-		R2[i] = r2;
-	}	
-}
-
-static void
-createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
-{		
-	int i, j, k, t;
-	int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-    int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-    int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-	int radix, R1, R2;
-	int numRadices;
-	
-	int maxThreadsPerBlock = plan->max_work_item_per_workgroup;
-	int maxArrayLen = plan->max_radix;
-	int batchSize = plan->min_mem_coalesce_width;	
-	clFFT_DataFormat dataFormat = plan->format;
-	int vertical = (dir == cl_fft_kernel_x) ? 0 : 1;	
-	
-	getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices);
-		
-	int numPasses = numRadices;
-	
-	string localString(""), kernelName("");
-	string *kernelString = plan->kernel_string;
-	cl_fft_kernel_info **kInfo = &plan->kernel_info; 
-	int kCount = 0;
-	
-	while(*kInfo)
-	{
-		kInfo = &(*kInfo)->next;
-		kCount++;
-	}
-	
-	int N = n;
-	int m = (int)log2(n);
-	int Rinit = vertical ? BS : 1;
-	batchSize = vertical ? min(BS, batchSize) : batchSize;
-	int passNum;
-	
-	for(passNum = 0; passNum < numPasses; passNum++) 
-	{
-		
-		localString.clear();
-		kernelName.clear();
-		
-		radix = radixArr[passNum];
-		R1 = R1Arr[passNum];
-		R2 = R2Arr[passNum];
-				
-		int strideI = Rinit;
-		for(i = 0; i < numPasses; i++)
-			if(i != passNum)
-				strideI *= radixArr[i];
-		
-		int strideO = Rinit;
-		for(i = 0; i < passNum; i++)
-			strideO *= radixArr[i];
-		
-		int threadsPerXForm = R2;
-		batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
-		batchSize = min(batchSize, strideI);
-		int threadsPerBlock = batchSize * threadsPerXForm;
-		threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock);
-		batchSize = threadsPerBlock / threadsPerXForm;
-		assert(R2 <= R1);
-		assert(R1*R2 == radix);
-		assert(R1 <= maxArrayLen);
-		assert(threadsPerBlock <= maxThreadsPerBlock);
-		
-		int numIter = R1 / R2;
-		int gInInc = threadsPerBlock / batchSize;
-		
-		
-		int lgStrideO = log2(strideO);
-		int numBlocksPerXForm = strideI / batchSize;
-		int numBlocks = numBlocksPerXForm;
-		if(!vertical)
-			numBlocks *= BS;
-		else
-			numBlocks *= vertBS;
-		
-		kernelName = string("fft") + num2str(kCount);
-		*kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
-		(*kInfo)->kernel = 0;
-		if(R2 == 1)
-			(*kInfo)->lmem_size = 0;
-		else
-		{
-		    if(strideO == 1)
-		        (*kInfo)->lmem_size = (radix + 1)*batchSize;
-		    else
-			    (*kInfo)->lmem_size = threadsPerBlock*R1;
-		}
-		(*kInfo)->num_workgroups = numBlocks;
-        (*kInfo)->num_xforms_per_workgroup = 1;
-		(*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
-		(*kInfo)->dir = dir;
-		if( (passNum == (numPasses - 1)) && (numPasses & 1) )
-		    (*kInfo)->in_place_possible = 1;
-		else
-			(*kInfo)->in_place_possible = 0;
-		(*kInfo)->next = NULL;
-		(*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
-		strcpy((*kInfo)->kernel_name, kernelName.c_str());
-		
-		insertVariables(localString, R1);
-						
-		if(vertical) 
-		{
-			localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
-			localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
-			localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n");
-			localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n");
-			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
-			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
-			int stride = radix*Rinit;
-			for(i = 0; i < passNum; i++)
-				stride *= radixArr[i];
-			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n");
-			localString += string("bNum = groupId;\n");
-		}
-		else 
-		{
-			int lgNumBlocksPerXForm = log2(numBlocksPerXForm);
-			localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
-			localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
-			localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n");
-			localString += string("tid = indexIn;\n");
-			localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
-			localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); 
-			int stride = radix*Rinit;
-			for(i = 0; i < passNum; i++)
-				stride *= radixArr[i];
-			localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");			
-			localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
-			localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");	
-		}
-		
-		// Load Data
-		int lgBatchSize = log2(batchSize);
-		localString += string("tid = lId;\n");
-		localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
-		localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); 
-		localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n");
-
-		if(dataFormat == clFFT_SplitComplexFormat) 
-		{
-			localString += string("in_real += indexIn;\n");
-			localString += string("in_imag += indexIn;\n");			
-			for(j = 0; j < R1; j++)
-				localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n");
-			for(j = 0; j < R1; j++) 
-				localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n");
-		}
-		else 
-		{
-			localString += string("in += indexIn;\n");
-			for(j = 0; j < R1; j++)
-				localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
-	    }
-		
-		localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");							  
-		
-		if(R2 > 1)
-		{
-		    // twiddle
-		    for(k = 1; k < R1; k++) 
-		    {
-			    localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
-			    localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
-			    localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); 
-		    }
-		
-		    // shuffle
-		    numIter = R1 / R2;	
-		    localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n");
-		    localString += string("lMemStore = sMem + tid;\n");
-		    localString += string("lMemLoad = sMem + indexIn;\n");
-		    for(k = 0; k < R1; k++) 
-			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
-		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");	
-		    for(k = 0; k < numIter; k++)
-			    for(t = 0; t < R2; t++)
-				    localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
-		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-		    for(k = 0; k < R1; k++) 
-			    localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
-		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");	
-		    for(k = 0; k < numIter; k++)
-			    for(t = 0; t < R2; t++)
-				    localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
-		    localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-		
-		    for(j = 0; j < numIter; j++)
-			    localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n");
-		}
-		
-		// twiddle
-		if(passNum < (numPasses - 1)) 
-		{
-			localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n");
-			localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); 
-			localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n");
-			for(t = 0; t < R1; t++) 
-			{
-				localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
-				localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
-				localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n");
-			}
-		}
-		
-		// Store Data
-		if(strideO == 1) 
-		{
-			
-			localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
-			localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
-			
-			for(int i = 0; i < R1/R2; i++)
-				for(int j = 0; j < R2; j++)
-					localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n");
-			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-			if(threadsPerBlock >= radix)
-            {
-                for(int i = 0; i < R1; i++)
-                localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
-            }
-            else
-            {
-                int innerIter = radix/threadsPerBlock;
-                int outerIter = R1/innerIter;
-                for(int i = 0; i < outerIter; i++)
-                    for(int j = 0; j < innerIter; j++)
-                        localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
-            }
-			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-			
-			for(int i = 0; i < R1/R2; i++)
-				for(int j = 0; j < R2; j++)
-					localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n");
-			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-			if(threadsPerBlock >= radix)
-            {
-                for(int i = 0; i < R1; i++)
-                    localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
-            }
-            else
-            {
-                int innerIter = radix/threadsPerBlock;
-                int outerIter = R1/innerIter;
-                for(int i = 0; i < outerIter; i++)
-                    for(int j = 0; j < innerIter; j++)
-                        localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
-            }
-			localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
-			
-			localString += string("indexOut += tid;\n");
-			if(dataFormat == clFFT_SplitComplexFormat) {
-				localString += string("out_real += indexOut;\n");
-				localString += string("out_imag += indexOut;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
-			}
-			else {
-				localString += string("out += indexOut;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");				
-			}
-		 
-		}
-		else 
-		{
-			localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n");
-			if(dataFormat == clFFT_SplitComplexFormat) {
-				localString += string("out_real += indexOut;\n");
-				localString += string("out_imag += indexOut;\n");			
-				for(k = 0; k < R1; k++)
-					localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n");
-			}
-			else {
-				localString += string("out += indexOut;\n");
-				for(k = 0; k < R1; k++)
-					localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n");
-			}
-		}
-		
-		insertHeader(*kernelString, kernelName, dataFormat);
-		*kernelString += string("{\n");
-		if((*kInfo)->lmem_size)
-			*kernelString += string("    __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
-		*kernelString += localString;
-		*kernelString += string("}\n");		
-		
-		N /= radix;
-		kInfo = &(*kInfo)->next;
-		kCount++;
-	}
-}
-
-void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
-{	
-    unsigned int radixArray[10];
-    unsigned int numRadix;
-    
-	switch(dir)
-	{
-		case cl_fft_kernel_x:
-		    if(plan->n.x > plan->max_localmem_fft_size)
-		    {
-		        createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
-		    }
-		    else if(plan->n.x > 1)
-		    {
-		        getRadixArray(plan->n.x, radixArray, &numRadix, 0);
-		        if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
-		        {
-				    createLocalMemfftKernelString(plan);
-				}
-			    else
-			    {
-			        getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
-			        if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
-			            createLocalMemfftKernelString(plan);
-			        else
-				        createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
-				}
-		    }
-			break;
-			
-		case cl_fft_kernel_y:
-			if(plan->n.y > 1)
-			    createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
-			break;
-			
-		case cl_fft_kernel_z:
-			if(plan->n.z > 1)
-			    createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1);
-		default:
-			return;
-	}
-}
-
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/fft_setup.cpp b/RTCP/Cobalt/OpenCL_FFT/src/fft_setup.cpp
deleted file mode 100644
index f2c0f03957b3ce0e2efb9e4b6feea6c21f9003a1..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/fft_setup.cpp
+++ /dev/null
@@ -1,403 +0,0 @@
-
-//
-// File:       fft_setup.cpp
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#include "fft_internal.h"
-#include "fft_base_kernels.h"
-#include <climits>
-#include <cstdlib>
-#include <cstring>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <iostream>
-#include <string>
-#include <sstream>
-
-using namespace std;
-
-extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems);
-
-static void 
-getBlockConfigAndKernelString(cl_fft_plan *plan)
-{
-	plan->temp_buffer_needed = 0;
-	*plan->kernel_string += baseKernels;
-	
-	if(plan->format == clFFT_SplitComplexFormat)
-		*plan->kernel_string += twistKernelPlannar;
-	else
-		*plan->kernel_string += twistKernelInterleaved;
-	
-	switch(plan->dim) 
-	{
-		case clFFT_1D:
-			FFT1D(plan, cl_fft_kernel_x);
-			break;
-			
-		case clFFT_2D:
-			FFT1D(plan, cl_fft_kernel_x); 
-			FFT1D(plan, cl_fft_kernel_y);  
-			break;
-			
-		case clFFT_3D:
-			FFT1D(plan, cl_fft_kernel_x); 
-			FFT1D(plan, cl_fft_kernel_y); 
-			FFT1D(plan, cl_fft_kernel_z); 
-			break;
-			
-		default:
-			return;
-	}
-	
-	plan->temp_buffer_needed = 0;
-	cl_fft_kernel_info *kInfo = plan->kernel_info;
-	while(kInfo)
-	{
-		plan->temp_buffer_needed |= !kInfo->in_place_possible;
-		kInfo = kInfo->next;
-	}
-}
-
- 
-static void
-deleteKernelInfo(cl_fft_kernel_info *kInfo)
-{
-	if(kInfo)
-	{
-	    if(kInfo->kernel_name)
-		    free(kInfo->kernel_name);
-	    if(kInfo->kernel)
-		    clReleaseKernel(kInfo->kernel);
-		free(kInfo);
-	}	
-}
-
-static void
-destroy_plan(cl_fft_plan *Plan)
-{
-    cl_fft_kernel_info *kernel_info = Plan->kernel_info;
-
-	while(kernel_info)
-	{
-		cl_fft_kernel_info *tmp = kernel_info->next;
-		deleteKernelInfo(kernel_info);
-		kernel_info = tmp;
-	}
-	
-	Plan->kernel_info = NULL;
-		
-	if(Plan->kernel_string)
-	{
-		delete Plan->kernel_string;
-		Plan->kernel_string = NULL;
-	}			
-	if(Plan->twist_kernel)
-	{
-		clReleaseKernel(Plan->twist_kernel);
-		Plan->twist_kernel = NULL;
-	}
-	if(Plan->program)
-	{
-		clReleaseProgram(Plan->program);
-		Plan->program = NULL;
-	}
-	if(Plan->tempmemobj) 
-	{
-		clReleaseMemObject(Plan->tempmemobj);
-		Plan->tempmemobj = NULL;
-	}
-	if(Plan->tempmemobj_real)
-	{
-		clReleaseMemObject(Plan->tempmemobj_real);
-		Plan->tempmemobj_real = NULL;
-	}
-	if(Plan->tempmemobj_imag)
-	{
-		clReleaseMemObject(Plan->tempmemobj_imag);
-		Plan->tempmemobj_imag = NULL;
-	}
-}
-
-static int
-createKernelList(cl_fft_plan *plan) 
-{
-	cl_program program = plan->program;
-	cl_fft_kernel_info *kernel_info = plan->kernel_info;
-	
-	cl_int err;
-	while(kernel_info)
-	{
-		kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
-		if(!kernel_info->kernel || err != CL_SUCCESS)
-			return err;
-		kernel_info = kernel_info->next;		
-	}
-	
-	if(plan->format == clFFT_SplitComplexFormat)
-		plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
-	else
-		plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
-	
-	if(!plan->twist_kernel || err)
-		return err;
-
-	return CL_SUCCESS;
-}
-
-int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
-{	
-    int reg_needed = 0;
-    *max_wg_size = INT_MAX;
-    int err;
-    size_t wg_size;
-    
-    unsigned int i;
-    for(i = 0; i < num_devices; i++)
-    {
-	    cl_fft_kernel_info *kInfo = plan->kernel_info;
-	    while(kInfo)
-	    {
-		    err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
-		    if(err != CL_SUCCESS)
-		        return -1;
-		        
-		    if(wg_size < kInfo->num_workitems_per_workgroup)
-		        reg_needed |= 1;
-		    
-		    if(*max_wg_size > wg_size)
-		        *max_wg_size = wg_size;
-		        
-		    kInfo = kInfo->next;
-	    }
-	}
-	
-	return reg_needed;
-}	
-
-#define ERR_MACRO(err) { \
-                         if( err != CL_SUCCESS) \
-                         { \
-                           if(error_code) \
-                               *error_code = err; \
-                           clFFT_DestroyPlan((clFFT_Plan) plan); \
-						   return (clFFT_Plan) NULL; \
-                         } \
-					   }
-
-clFFT_Plan
-clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code )
-{
-	int i;
-	cl_int err;
-	int isPow2 = 1;
-	cl_fft_plan *plan = NULL;
-	ostringstream kString;
-	int num_devices;
-	int gpu_found = 0;
-	cl_device_id devices[16];
-	size_t ret_size;
-	cl_device_type device_type;
-	
-    if(!context)
-		ERR_MACRO(CL_INVALID_VALUE);
-	
-	isPow2 |= n.x && !( (n.x - 1) & n.x );
-	isPow2 |= n.y && !( (n.y - 1) & n.y );
-	isPow2 |= n.z && !( (n.z - 1) & n.z );
-	
-	if(!isPow2)
-		ERR_MACRO(CL_INVALID_VALUE);
-	
-	if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) )
-		ERR_MACRO(CL_INVALID_VALUE);
-
-	plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan));
-	if(!plan)
-		ERR_MACRO(CL_OUT_OF_RESOURCES);
-	
-	plan->context = context;
-	clRetainContext(context);
-	plan->n = n;
-	plan->dim = dim;
-	plan->format = dataFormat;
-	plan->kernel_info = 0;
-	plan->num_kernels = 0;
-	plan->twist_kernel = 0;
-	plan->program = 0;
-	plan->temp_buffer_needed = 0;
-	plan->last_batch_size = 0;
-	plan->tempmemobj = 0;
-	plan->tempmemobj_real = 0;
-	plan->tempmemobj_imag = 0;
-	plan->max_localmem_fft_size = 2048;
-	plan->max_work_item_per_workgroup = 256;
-	plan->max_radix = 16;
-	plan->min_mem_coalesce_width = 16;
-	plan->num_local_mem_banks = 16;	
-	
-patch_kernel_source:
-
-	plan->kernel_string = new string("");
-	if(!plan->kernel_string)
-        ERR_MACRO(CL_OUT_OF_RESOURCES);
-
-	getBlockConfigAndKernelString(plan);
-	
-	const char *source_str = plan->kernel_string->c_str();
-	plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err);
-    ERR_MACRO(err);
-
-	err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
-	ERR_MACRO(err);
-	
-	num_devices = ret_size / sizeof(cl_device_id);
-	cl_int build_err = clBuildProgram(plan->program, num_devices, devices, "-cl-mad-enable", NULL, NULL);
-	
-	for(i = 0; i < num_devices; i++)
-	{
-		err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
-		ERR_MACRO(err);
-		
-		if(device_type == CL_DEVICE_TYPE_CPU || device_type == CL_DEVICE_TYPE_GPU || CL_DEVICE_TYPE_ACCELERATOR)
-		{	
-			gpu_found = 1;
-	        //err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL);
-	        if (build_err != CL_SUCCESS)
-	        {
-		        char *build_log;				
-				char devicename[200];
-		        size_t log_size;
-				
-		        err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-				ERR_MACRO(err);
-				
-		        build_log = (char *) malloc(log_size + 1);
-				
-			    err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
-				ERR_MACRO(err);
-				
-				err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
-				ERR_MACRO(err);
-				
-				fprintf(stdout, "FFT program build log on device %s\n", devicename);
-		        fprintf(stdout, "%s\n", build_log);
-		        free(build_log);
-				
-				ERR_MACRO(err);
-			}	
-		}	
-	}
-	
-	if(!gpu_found)
-		ERR_MACRO(CL_INVALID_CONTEXT);
-	
-	err = createKernelList(plan); 
-    ERR_MACRO(err);
-    
-    // we created program and kernels based on "some max work group size (default 256)" ... this work group size
-    // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source 
-    // setting this as limit i.e max group size and rebuild. 
-	unsigned int max_kernel_wg_size; 
-	int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
-	if(patching_req == -1)
-	{
-	    ERR_MACRO(err);
-	}
-	
-	if(patching_req)
-	{
-	    destroy_plan(plan);
-	    plan->max_work_item_per_workgroup = max_kernel_wg_size;
-	    goto patch_kernel_source;
-	}
-	
-	cl_fft_kernel_info *kInfo = plan->kernel_info;
-	while(kInfo)
-	{
-		plan->num_kernels++;
-		kInfo = kInfo->next;
-	}
-	
-	if(error_code)
-		*error_code = CL_SUCCESS;
-			
-	return (clFFT_Plan) plan;
-}
-
-void		 
-clFFT_DestroyPlan(clFFT_Plan plan)
-{
-    cl_fft_plan *Plan = (cl_fft_plan *) plan;
-	if(Plan) 
-	{	
-		destroy_plan(Plan);	
-		clReleaseContext(Plan->context);
-		free(Plan);
-	}		
-}
-
-void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file)
-{
-	size_t gDim, lDim;
-	FILE *out;
-	if(!file)
-		out = stdout;
-	else 
-		out = file;
-	
-	cl_fft_plan *plan = (cl_fft_plan *) Plan;
-	cl_fft_kernel_info *kInfo = plan->kernel_info;
-	
-	while(kInfo)
-	{
-		cl_int s = 1;
-		getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
-		fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim);
-		kInfo = kInfo->next;
-	}
-	fprintf(out, "%s\n", plan->kernel_string->c_str());
-}
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/libOpenCL_FFT.a.not b/RTCP/Cobalt/OpenCL_FFT/src/libOpenCL_FFT.a.not
deleted file mode 100644
index 595b15d060a75485d1668d9fd6271c350eec7afb..0000000000000000000000000000000000000000
Binary files a/RTCP/Cobalt/OpenCL_FFT/src/libOpenCL_FFT.a.not and /dev/null differ
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/main.cpp b/RTCP/Cobalt/OpenCL_FFT/src/main.cpp
deleted file mode 100755
index bc6b956137faedcb2a58c25e1d81d3aedd8a4568..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/main.cpp
+++ /dev/null
@@ -1,882 +0,0 @@
-
-//
-// File:       main.cpp
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#include <string.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <CL/cl.h>
-#include "clFFT.h"
-//#include <mach/mach_time.h>
-#include <Accelerate/Accelerate.h>
-#include "procs.h"
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <stdint.h>
-#include <float.h>
-
-#define eps_avg 10.0
-
-#define MAX( _a, _b)	((_a)>(_b)?(_a) : (_b))
-
-typedef enum {
-	clFFT_OUT_OF_PLACE,
-	clFFT_IN_PLACE,
-}clFFT_TestType;
-
-typedef struct
-{
-	double real;
-	double imag;
-}clFFT_ComplexDouble;
-
-typedef struct
-{
-	double *real;
-	double *imag;
-}clFFT_SplitComplexDouble;
-
-cl_device_id     device_id;
-cl_context       context;
-cl_command_queue queue;
-
-typedef unsigned long long ulong;
-
-double subtractTimes( uint64_t endTime, uint64_t startTime )
-{
-    uint64_t difference = endTime - startTime;
-    static double conversion = 0.0;
-    
-    if( conversion == 0.0 )
-    {
-        mach_timebase_info_data_t info;
-        kern_return_t err = mach_timebase_info( &info );
-        
-		//Convert the timebase into seconds
-        if( err == 0  )
-			conversion = 1e-9 * (double) info.numer / (double) info.denom;
-    }
-    
-    return conversion * (double) difference;
-}
-
-void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n, 
-					  unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
-{
-	FFTSetup plan_vdsp;
-	DSPSplitComplex out_vdsp;
-	FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
-	
-	unsigned int i, j, k;
-	unsigned int stride;
-	unsigned int log2Nx = (unsigned int) log2(n.x);
-	unsigned int log2Ny = (unsigned int) log2(n.y);
-	unsigned int log2Nz = (unsigned int) log2(n.z);
-	unsigned int log2N;
-	
-	log2N = log2Nx;
-	log2N = log2N > log2Ny ? log2N : log2Ny;
-	log2N = log2N > log2Nz ? log2N : log2Nz;
-	
-	plan_vdsp = vDSP_create_fftsetup(log2N, 2);
-	
-	switch(dim)
-	{
-		case clFFT_1D:
-			
-			for(i = 0; i < batchSize; i++)
-			{
-				stride = i * n.x;
-				out_vdsp.realp  = out->real  + stride;
-				out_vdsp.imagp  = out->imag  + stride;
-				
-			    vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
-			}
-			break;
-			
-		case clFFT_2D:
-			
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.y; j++)
-				{
-					stride = j * n.x + i * n.x * n.y;
-					out_vdsp.realp = out->real + stride;
-					out_vdsp.imagp = out->imag + stride;
-					
-					vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
-				}
-			}
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.x; j++)
-				{
-					stride = j + i * n.x  * n.y;
-					out_vdsp.realp = out->real + stride;
-					out_vdsp.imagp = out->imag + stride;
-					
-					vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
-				}
-			}
-			break;
-			
-		case clFFT_3D:
-			
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.z; j++)
-				{
-					for(k = 0; k < n.y; k++)
-					{
-						stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
-						out_vdsp.realp = out->real + stride;
-						out_vdsp.imagp = out->imag + stride;
-						
-						vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
-					}
-				}
-			}
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.z; j++)
-				{
-					for(k = 0; k < n.x; k++)
-					{
-						stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
-						out_vdsp.realp = out->real + stride;
-						out_vdsp.imagp = out->imag + stride;
-						
-						vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
-					}
-				}
-			}
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.y; j++)
-				{
-					for(k = 0; k < n.x; k++)
-					{
-						stride = k + j * n.x + i * n.x * n.y * n.z;
-						out_vdsp.realp = out->real + stride;
-						out_vdsp.imagp = out->imag + stride;
-						
-						vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
-					}
-				}
-			}
-			break;
-	}
-	
-	vDSP_destroy_fftsetup(plan_vdsp);
-}
-
-void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n, 
-					  unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
-{
-	FFTSetupD plan_vdsp;
-	DSPDoubleSplitComplex out_vdsp;
-	FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
-	
-	unsigned int i, j, k;
-	unsigned int stride;
-	unsigned int log2Nx = (int) log2(n.x);
-	unsigned int log2Ny = (int) log2(n.y);
-	unsigned int log2Nz = (int) log2(n.z);
-	unsigned int log2N;
-	
-	log2N = log2Nx;
-	log2N = log2N > log2Ny ? log2N : log2Ny;
-	log2N = log2N > log2Nz ? log2N : log2Nz;
-	
-	plan_vdsp = vDSP_create_fftsetupD(log2N, 2);
-	
-	switch(dim)
-	{
-		case clFFT_1D:
-			
-			for(i = 0; i < batchSize; i++)
-			{
-				stride = i * n.x;
-				out_vdsp.realp  = out->real  + stride;
-				out_vdsp.imagp  = out->imag  + stride;
-				
-			    vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
-			}
-			break;
-			
-		case clFFT_2D:
-			
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.y; j++)
-				{
-					stride = j * n.x + i * n.x * n.y;
-					out_vdsp.realp = out->real + stride;
-					out_vdsp.imagp = out->imag + stride;
-					
-					vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
-				}
-			}
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.x; j++)
-				{
-					stride = j + i * n.x  * n.y;
-					out_vdsp.realp = out->real + stride;
-					out_vdsp.imagp = out->imag + stride;
-					
-					vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
-				}
-			}
-			break;
-			
-		case clFFT_3D:
-			
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.z; j++)
-				{
-					for(k = 0; k < n.y; k++)
-					{
-						stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
-						out_vdsp.realp = out->real + stride;
-						out_vdsp.imagp = out->imag + stride;
-						
-						vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
-					}
-				}
-			}
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.z; j++)
-				{
-					for(k = 0; k < n.x; k++)
-					{
-						stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
-						out_vdsp.realp = out->real + stride;
-						out_vdsp.imagp = out->imag + stride;
-						
-						vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
-					}
-				}
-			}
-			for(i = 0; i < batchSize; i++)
-			{
-				for(j = 0; j < n.y; j++)
-				{
-					for(k = 0; k < n.x; k++)
-					{
-						stride = k + j * n.x + i * n.x * n.y * n.z;
-						out_vdsp.realp = out->real + stride;
-						out_vdsp.imagp = out->imag + stride;
-						
-						vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
-					}
-				}
-			}
-			break;
-	}
-	
-	vDSP_destroy_fftsetupD(plan_vdsp);
-}
-
-double complexNormSq(clFFT_ComplexDouble a)
-{
-	return (a.real * a.real + a.imag * a.imag);
-}
-
-double computeL2Error(clFFT_SplitComplex *data, clFFT_SplitComplexDouble *data_ref, int n, int batchSize, double *max_diff, double *min_diff)
-{
-	int i, j;
-	double avg_norm = 0.0;
-	*max_diff = 0.0;
-	*min_diff = 0x1.0p1000;
-	
-	for(j = 0; j < batchSize; j++)
-	{
-		double norm_ref = 0.0;
-		double norm = 0.0;
-	    for(i = 0; i < n; i++) 
-		{
-			int index = j * n + i;
-		    clFFT_ComplexDouble diff = (clFFT_ComplexDouble) { data_ref->real[index] - data->real[index], data_ref->imag[index] - data->imag[index] };
-		    double norm_tmp = complexNormSq(diff);
-		    norm += norm_tmp;
-		    norm_ref += (data_ref->real[index] * data_ref->real[index] + data_ref->imag[index] * data_ref->imag[index]);
-	    }
-	    double curr_norm = sqrt( norm / norm_ref ) / FLT_EPSILON;
-		avg_norm += curr_norm;
-		*max_diff = *max_diff < curr_norm ? curr_norm : *max_diff;
-		*min_diff = *min_diff > curr_norm ? curr_norm : *min_diff;
-	}
-	
-	return avg_norm / batchSize;
-}
-
-void convertInterleavedToSplit(clFFT_SplitComplex *result_split, clFFT_Complex *data_cl, int length)
-{
-	int i;
-	for(i = 0; i < length; i++) {
-		result_split->real[i] = data_cl[i].real;
-		result_split->imag[i] = data_cl[i].imag;
-	}
-}
-
-int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim, 
-			clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType)
-{	
-	cl_int err = CL_SUCCESS;
-	int iter;
-	double t;
-	
-	uint64_t t0, t1;
-	int mx = log2(n.x);
-	int my = log2(n.y);
-	int mz = log2(n.z);
-
-	int length = n.x * n.y * n.z * batchSize;
-		
-	double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter;
-	
-	clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL };
-	clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL };
-	clFFT_Complex *data_i = NULL;
-	clFFT_Complex *data_cl = NULL;
-	clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL }; 
-	clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL };
-	
-	clFFT_Plan plan = NULL;
-	cl_mem data_in = NULL;
-	cl_mem data_out = NULL;
-	cl_mem data_in_real = NULL;
-	cl_mem data_in_imag = NULL;
-	cl_mem data_out_real = NULL;
-	cl_mem data_out_imag = NULL;
-	
-	if(dataFormat == clFFT_SplitComplexFormat) {
-		data_i_split.real     = (float *) malloc(sizeof(float) * length);
-		data_i_split.imag     = (float *) malloc(sizeof(float) * length);
-		data_cl_split.real    = (float *) malloc(sizeof(float) * length);
-		data_cl_split.imag    = (float *) malloc(sizeof(float) * length);
-		if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag)
-		{
-			err = -1;
-			log_error("Out-of-Resources\n");
-			goto cleanup;
-		}
-	}
-	else {
-		data_i  = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
-		data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
-		if(!data_i || !data_cl)
-		{
-			err = -2;
-			log_error("Out-of-Resouces\n");
-			goto cleanup;
-		}
-	}
-	
-	data_iref.real   = (double *) malloc(sizeof(double) * length);
-	data_iref.imag   = (double *) malloc(sizeof(double) * length);
-	data_oref.real   = (double *) malloc(sizeof(double) * length);
-	data_oref.imag   = (double *) malloc(sizeof(double) * length);	
-	if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag)
-	{
-		err = -3;
-		log_error("Out-of-Resources\n");
-		goto cleanup;
-	}
-
-	int i;
-	if(dataFormat == clFFT_SplitComplexFormat) {
-		for(i = 0; i < length; i++)
-		{
-			data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
-			data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
-			data_cl_split.real[i] = 0.0f;
-			data_cl_split.imag[i] = 0.0f;			
-			data_iref.real[i] = data_i_split.real[i];
-			data_iref.imag[i] = data_i_split.imag[i];
-			data_oref.real[i] = data_iref.real[i];
-			data_oref.imag[i] = data_iref.imag[i];	
-		}
-	}
-	else {
-		for(i = 0; i < length; i++)
-		{
-			data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
-			data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
-			data_cl[i].real = 0.0f;
-			data_cl[i].imag = 0.0f;			
-			data_iref.real[i] = data_i[i].real;
-			data_iref.imag[i] = data_i[i].imag;
-			data_oref.real[i] = data_iref.real[i];
-			data_oref.imag[i] = data_iref.imag[i];	
-		}		
-	}
-	
-	plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err );
-	if(!plan || err) 
-	{
-		log_error("clFFT_CreatePlan failed\n");
-		goto cleanup;
-	}
-	
-	//clFFT_DumpPlan(plan, stdout);
-	
-	if(dataFormat == clFFT_SplitComplexFormat)
-	{
-		data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err);
-	    if(!data_in_real || err) 
-	    {
-			log_error("clCreateBuffer failed\n");
-			goto cleanup;
-	    }
-		
-		data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err);
-	    if(!data_in_imag || err) 
-	    {
-			log_error("clCreateBuffer failed\n");
-			goto cleanup;
-	    }
-		
-		if(testType == clFFT_OUT_OF_PLACE)
-		{
-			data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err);
-			if(!data_out_real || err) 
-			{
-				log_error("clCreateBuffer failed\n");
-				goto cleanup;
-			}
-			
-			data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err);
-			if(!data_out_imag || err) 
-			{
-				log_error("clCreateBuffer failed\n");
-				goto cleanup;
-			}			
-		}
-		else
-		{
-			data_out_real = data_in_real;
-			data_out_imag = data_in_imag;
-		}
-	}
-	else
-	{
-	    data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err);
-	    if(!data_in) 
-	    {
-			log_error("clCreateBuffer failed\n");
-			goto cleanup;
-	    }
-		if(testType == clFFT_OUT_OF_PLACE)
-		{
-			data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err);
-			if(!data_out) 
-			{
-				log_error("clCreateBuffer failed\n");
-				goto cleanup;
-			}			
-		}
-		else
-			data_out = data_in;
-	}
-		
-			
-	err = CL_SUCCESS;
-	
-	t0 = mach_absolute_time();
-	if(dataFormat == clFFT_SplitComplexFormat)
-	{
-		for(iter = 0; iter < numIter; iter++)
-		    err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL);
-	}
-	else
-	{
-	    for(iter = 0; iter < numIter; iter++) 
-			err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL);
-	}
-	
-	err |= clFinish(queue);
-	
-	if(err) 
-	{
-		log_error("clFFT_Execute\n");
-		goto cleanup;	
-	}
-	
-	t1 = mach_absolute_time(); 
-	t = subtractTimes(t1, t0);
-	char temp[100];
-	sprintf(temp, "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize);
-	log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp);
-
-	if(dataFormat == clFFT_SplitComplexFormat)
-	{	
-		err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL);
-		err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL);
-	}
-	else
-	{
-		err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL);
-	}
-	
-	if(err) 
-	{
-		log_error("clEnqueueReadBuffer failed\n");
-        goto cleanup;
-	}	
-
-	computeReferenceD(&data_oref, n, batchSize, dim, dir);
-	
-	double diff_avg, diff_max, diff_min;
-	if(dataFormat == clFFT_SplitComplexFormat) {
-		diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
-		if(diff_avg > eps_avg)
-			log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
-		else
-			log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);			
-	}
-	else {
-		clFFT_SplitComplex result_split;
-		result_split.real = (float *) malloc(length*sizeof(float));
-		result_split.imag = (float *) malloc(length*sizeof(float));
-		convertInterleavedToSplit(&result_split, data_cl, length);
-		diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
-		
-		if(diff_avg > eps_avg)
-			log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
-		else
-			log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);	
-		free(result_split.real);
-		free(result_split.imag);
-	}
-	
-cleanup:
-	clFFT_DestroyPlan(plan);	
-	if(dataFormat == clFFT_SplitComplexFormat) 
-	{
-		if(data_i_split.real)
-			free(data_i_split.real);
-		if(data_i_split.imag)
-			free(data_i_split.imag);
-		if(data_cl_split.real)
-			free(data_cl_split.real);
-		if(data_cl_split.imag)
-			free(data_cl_split.imag);
-		
-		if(data_in_real)
-			clReleaseMemObject(data_in_real);
-		if(data_in_imag)
-			clReleaseMemObject(data_in_imag);
-		if(data_out_real && testType == clFFT_OUT_OF_PLACE)
-			clReleaseMemObject(data_out_real);
-		if(data_out_imag && clFFT_OUT_OF_PLACE)
-			clReleaseMemObject(data_out_imag);
-	}
-	else 
-	{
-		if(data_i)
-			free(data_i);
-		if(data_cl)
-			free(data_cl);
-		
-		if(data_in)
-			clReleaseMemObject(data_in);
-		if(data_out && testType == clFFT_OUT_OF_PLACE)
-			clReleaseMemObject(data_out);
-	}
-	
-	if(data_iref.real)
-		free(data_iref.real);
-	if(data_iref.imag)
-		free(data_iref.imag);		
-	if(data_oref.real)
-		free(data_oref.real);
-	if(data_oref.imag)
-		free(data_oref.imag);
-	
-	return err;
-}
-
-bool ifLineCommented(const char *line) {
-	const char *Line = line;
-	while(*Line != '\0')
-		if((*Line == '/') && (*(Line + 1) == '/'))
-			return true;
-		else
-			Line++;
-	return false;
-}
-
-cl_device_type getGlobalDeviceType()
-{
-	char *force_cpu = getenv( "CL_DEVICE_TYPE" );
-	if( force_cpu != NULL )
-	{
-		if( strcmp( force_cpu, "gpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_GPU" ) == 0 )
-			return CL_DEVICE_TYPE_GPU;
-		else if( strcmp( force_cpu, "cpu" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_CPU" ) == 0 )
-			return CL_DEVICE_TYPE_CPU;
-		else if( strcmp( force_cpu, "accelerator" ) == 0 || strcmp( force_cpu, "CL_DEVICE_TYPE_ACCELERATOR" ) == 0 )
-			return CL_DEVICE_TYPE_ACCELERATOR;
-		else if( strcmp( force_cpu, "CL_DEVICE_TYPE_DEFAULT" ) == 0 )
-			return CL_DEVICE_TYPE_DEFAULT;
-	}
-	// default
-	return CL_DEVICE_TYPE_GPU;
-}
-
-void 
-notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
-{
-    log_error( "%s\n", errinfo );
-}
-
-int
-checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ulong gMemSize)
-{
-	cl_ulong memReq = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2;
-	memReq *= n.x*n.y*n.z*sizeof(clFFT_Complex)*batchSize;
-	memReq = memReq/1024/1024;
-	if(memReq >= gMemSize)
-		return -1;
-	return 0;
-}
-
-int main (int argc, char * const argv[]) {
-	
-	test_start();
-	
-	cl_ulong gMemSize;
-	clFFT_Direction dir = clFFT_Forward;
-	int numIter = 1;
-	clFFT_Dim3 n = { 1024, 1, 1 };
-	int batchSize = 1;
-	clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat;
-	clFFT_Dimension dim = clFFT_1D;
-	clFFT_TestType testType = clFFT_OUT_OF_PLACE;
-	cl_device_id device_ids[16];
-	
-	FILE *paramFile;
-			
-	cl_int err;
-	unsigned int num_devices;
-	
-	cl_device_type device_type = getGlobalDeviceType();	
-	if(device_type != CL_DEVICE_TYPE_GPU) 
-	{
-		log_info("Test only supported on DEVICE_TYPE_GPU\n");
-		test_finish();
-		exit(0);
-	}
-	
-	err = clGetDeviceIDs(NULL, device_type, sizeof(device_ids), device_ids, &num_devices);
-	if(err) 
-	{		
-		log_error("clGetComputeDevice failed\n");
-		test_finish();
-		return -1;
-	}
-	
-	device_id = NULL;
-	
-	unsigned int i;
-	for(i = 0; i < num_devices; i++)
-	{
-	    cl_bool available;
-	    err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL);
-	    if(err)
-	    {
-	         log_error("Cannot check device availability of device # %d\n", i);
-	    }
-	    
-	    if(available)
-	    {
-	        device_id = device_ids[i];
-	        break;
-	    }
-	    else
-	    {
-	        char name[200];
-	        err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL);
-	        if(err == CL_SUCCESS)
-	        {
-	             log_info("Device %s not available for compute\n", name);
-	        }
-	        else
-	        {
-	             log_info("Device # %d not available for compute\n", i);
-	        }
-	    }
-	}
-	
-	if(!device_id)
-	{
-	    log_error("None of the devices available for compute ... aborting test\n");
-	    test_finish();
-	    return -1;
-	}
-	
-	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
-	if(!context || err) 
-	{
-		log_error("clCreateContext failed\n");
-		test_finish();
-		return -1;
-	}
-	
-    queue = clCreateCommandQueue(context, device_id, 0, &err);
-    if(!queue || err)
-	{
-        log_error("clCreateCommandQueue() failed.\n");
-		clReleaseContext(context);
-        test_finish();
-        return -1;
-    }  
-	
-	err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL);
-	if(err)
-	{
-		log_error("Failed to get global mem size\n");
-		clReleaseContext(context);
-		clReleaseCommandQueue(queue);
-		test_finish();
-		return -2;
-	}
-	
-	gMemSize /= (1024*1024);
-			
-	char delim[] = " \n";
-	char tmpStr[100];
-	char line[200];
-	char *param, *val;	
-	int total_errors = 0;
-	if(argc == 1) {
-		log_error("Need file name with list of parameters to run the test\n");
-		test_finish();
-		return -1;
-	}
-	
-	if(argc == 2) {	// arguments are supplied in a file with arguments for a single run are all on the same line
-		paramFile = fopen(argv[1], "r");
-		if(!paramFile) {
-			log_error("Cannot open the parameter file\n");
-			clReleaseContext(context);
-			clReleaseCommandQueue(queue);			
-			test_finish();
-			return -3;
-		}
-		while(fgets(line, 199, paramFile)) {
-			if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line))
-				continue;
-			param = strtok(line, delim);
-			while(param) {
-				val = strtok(NULL, delim);
-				if(!strcmp(param, "-n")) {
-					sscanf(val, "%d", &n.x);
-					val = strtok(NULL, delim);
-					sscanf(val, "%d", &n.y);
-					val = strtok(NULL, delim);
-					sscanf(val, "%d", &n.z);					
-				}
-				else if(!strcmp(param, "-batchsize")) 
-					sscanf(val, "%d", &batchSize);
-				else if(!strcmp(param, "-dir")) {
-					sscanf(val, "%s", tmpStr);
-					if(!strcmp(tmpStr, "forward"))
-						dir = clFFT_Forward;
-					else if(!strcmp(tmpStr, "inverse"))
-						dir = clFFT_Inverse;
-				}
-				else if(!strcmp(param, "-dim")) {
-					sscanf(val, "%s", tmpStr);
-					if(!strcmp(tmpStr, "1D"))
-						dim = clFFT_1D;
-					else if(!strcmp(tmpStr, "2D"))
-						dim = clFFT_2D; 
-					else if(!strcmp(tmpStr, "3D"))
-						dim = clFFT_3D;					
-				}
-				else if(!strcmp(param, "-format")) {
-					sscanf(val, "%s", tmpStr);
-					if(!strcmp(tmpStr, "plannar"))
-						dataFormat = clFFT_SplitComplexFormat;
-					else if(!strcmp(tmpStr, "interleaved"))
-						dataFormat = clFFT_InterleavedComplexFormat;					
-				}
-				else if(!strcmp(param, "-numiter"))
-					sscanf(val, "%d", &numIter);
-				else if(!strcmp(param, "-testtype")) {
-					sscanf(val, "%s", tmpStr);
-					if(!strcmp(tmpStr, "out-of-place"))
-						testType = clFFT_OUT_OF_PLACE;
-					else if(!strcmp(tmpStr, "in-place"))
-						testType = clFFT_IN_PLACE;										
-				}
-				param = strtok(NULL, delim);
-			}
-			
-			if(checkMemRequirements(n, batchSize, testType, gMemSize)) {
-				log_info("This test cannot run because memory requirements canot be met by the available device\n");
-				continue;
-			}
-				
-			err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType);
-			if (err)
-				total_errors++;
-		}
-	}
-	
-	clReleaseContext(context);
-	clReleaseCommandQueue(queue);
-	
-	test_finish();
-	return total_errors;		
-}
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/param.txt b/RTCP/Cobalt/OpenCL_FFT/src/param.txt
deleted file mode 100644
index 595402aa437b07baa8936280909927a09eb6f6dd..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/param.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-
-//
-// File:       param.txt
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
--n 64 1 1 -batchsize 8192 -dir forward -dim 1D -format plannar -numiter 1000 -testtype out-of-place
--n 1024 1 1 -batchsize 8192 -dir forward -dim 1D -format plannar -numiter 1000 -testtype out-of-place
--n 1048576 1 1 -batchsize 4 -dir inverse -dim 1D -format interleaved -numiter 1000 -testtype out-of-place
--n 1024 512 1 -batchsize 8 -dir forward -dim 2D -format interleaved -numiter 1000 -testtype out-of-place
--n 128 128 128 -batchsize 1 -dir inverse -dim 3D -format interleaved -numiter 1000 -testtype out-of-place
--n 16384 1 1 -batchsize 4 -dir forward -dim 1D -format interleaved -numiter 1 -testtype in-place
--n 32 2048 1 -batchsize 8 -dir forward -dim 2D -format interleaved -numiter 1 -testtype in-place
--n 4096 64 1 -batchsize 4 -dir inverse -dim 2D -format plannar -numiter 1 -testtype in-place
--n 64 32 16 -batchsize 1 -dir inverse -dim 3D -format interleaved -numiter 1 -testtype out-of-place
diff --git a/RTCP/Cobalt/OpenCL_FFT/src/procs.h b/RTCP/Cobalt/OpenCL_FFT/src/procs.h
deleted file mode 100644
index f6028c9e5218900accc97fa3dddc19fed885e7df..0000000000000000000000000000000000000000
--- a/RTCP/Cobalt/OpenCL_FFT/src/procs.h
+++ /dev/null
@@ -1,53 +0,0 @@
-
-//
-// File:       procs.h
-//
-// Version:    <1.0>
-//
-// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
-//             in consideration of your agreement to the following terms, and your use,
-//             installation, modification or redistribution of this Apple software
-//             constitutes acceptance of these terms.  If you do not agree with these
-//             terms, please do not use, install, modify or redistribute this Apple
-//             software.
-//
-//             In consideration of your agreement to abide by the following terms, and
-//             subject to these terms, Apple grants you a personal, non - exclusive
-//             license, under Apple's copyrights in this original Apple software ( the
-//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
-//             Software, with or without modifications, in source and / or binary forms;
-//             provided that if you redistribute the Apple Software in its entirety and
-//             without modifications, you must retain this notice and the following text
-//             and disclaimers in all such redistributions of the Apple Software. Neither
-//             the name, trademarks, service marks or logos of Apple Inc. may be used to
-//             endorse or promote products derived from the Apple Software without specific
-//             prior written permission from Apple.  Except as expressly stated in this
-//             notice, no other rights or licenses, express or implied, are granted by
-//             Apple herein, including but not limited to any patent rights that may be
-//             infringed by your derivative works or by other works in which the Apple
-//             Software may be incorporated.
-//
-//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
-//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
-//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
-//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
-//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
-//
-//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
-//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
-//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
-//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
-//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-#define test_start()
-#define log_perf(_number, _higherBetter, _numType, _format, ...) printf("Performance Number " _format " (in %s, %s): %g\n",##__VA_ARGS__, _numType, _higherBetter?"higher is better":"lower is better" , _number)
-#define log_info printf
-#define log_error printf
-#define test_finish()