Skip to content
Snippets Groups Projects
Commit f663f631 authored by Bram Veenboer's avatar Bram Veenboer
Browse files

Add benchmark_hybrid

parent e04d759a
No related branches found
No related tags found
No related merge requests found
...@@ -421,6 +421,60 @@ void benchmark_allocmanaged(const char *name, int n, float v_input_1, ...@@ -421,6 +421,60 @@ void benchmark_allocmanaged(const char *name, int n, float v_input_1,
CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_output))); CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_output)));
} }
void benchmark_hybrid(const char *name, int n, float v_input_1, float v_input_2,
                      float v_output, float v_output_reference, CUstream stream,
                      CUdevice device, int num_repeats, int num_warmups) {
    // Hybrid buffer scheme: the two input arrays live in CUDA managed
    // (unified) memory so they can be prefetched onto the device, while the
    // output is a plain device allocation whose contents are copied back into
    // page-locked host memory after the kernel runs.
    size_t const size_bytes{n * sizeof(float)};

    float *d_input_1{nullptr};
    float *d_input_2{nullptr};
    float *d_output{nullptr};
    float *h_output{nullptr};

    CHECK_CUDA_ERROR(cuMemAllocManaged(
        reinterpret_cast<CUdeviceptr *>(&d_input_1), size_bytes,
        CU_MEM_ATTACH_GLOBAL));
    CHECK_CUDA_ERROR(cuMemAllocManaged(
        reinterpret_cast<CUdeviceptr *>(&d_input_2), size_bytes,
        CU_MEM_ATTACH_GLOBAL));
    CHECK_CUDA_ERROR(
        cuMemAlloc(reinterpret_cast<CUdeviceptr *>(&d_output), size_bytes));
    CHECK_CUDA_ERROR(
        cuMemHostAlloc(reinterpret_cast<void **>(&h_output), size_bytes, 0));

    // Stage 1: migrate both managed input buffers to the target device.
    std::function<void()> function_input{[&]() {
        CHECK_CUDA_ERROR(
            cuMemPrefetchAsync(reinterpret_cast<CUdeviceptr>(d_input_1),
                               size_bytes, device, stream));
        CHECK_CUDA_ERROR(
            cuMemPrefetchAsync(reinterpret_cast<CUdeviceptr>(d_input_2),
                               size_bytes, device, stream));
    }};

    // Stage 2: launch the element-wise addition kernel on the same stream,
    // so it is ordered after the prefetches above.
    std::function<void()> function_kernel{[&]() {
        // NOTE(review): trailing lowercase 'k' in the macro name below looks
        // like a typo of THREADS_PER_BLOCK — confirm against its definition.
        dim3 const threads_per_block{THREADS_PER_BLOCk};
        dim3 const blocks_per_grid{BLOCKS_PER_GRID};
        float_addition<<<blocks_per_grid, threads_per_block, 0, stream>>>(
            d_output, d_input_1, d_input_2, n);
        CHECK_LAST_CUDA_ERROR();
    }};

    // Stage 3: copy the result into pinned host memory and block until the
    // stream has drained, so the timing covers the full round trip.
    std::function<void()> function_output{[&]() {
        CHECK_CUDA_ERROR(cuMemcpyDtoHAsync(
            h_output, reinterpret_cast<CUdeviceptr>(d_output), size_bytes,
            stream));
        CHECK_CUDA_ERROR(cuStreamSynchronize(stream));
    }};

    run_benchmark(name, d_input_1, d_input_2, h_output, v_input_1, v_input_2,
                  v_output, d_output, v_output_reference, n, function_input,
                  function_kernel, function_output, stream, num_repeats,
                  num_warmups);
    report_device_memory(device);
    std::cout << std::endl;

    // Release in the same order the buffers were created; pinned host memory
    // needs cuMemFreeHost rather than cuMemFree.
    CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_input_1)));
    CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_input_2)));
    CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_output)));
    CHECK_CUDA_ERROR(cuMemFreeHost(h_output));
}
int main() { int main() {
constexpr int const num_repeats{3}; constexpr int const num_repeats{3};
constexpr int const num_warmups{1}; constexpr int const num_warmups{1};
...@@ -512,6 +566,12 @@ int main() { ...@@ -512,6 +566,12 @@ int main() {
prefetch ? 0 : num_warmups, flags, prefetch); prefetch ? 0 : num_warmups, flags, prefetch);
} }
} }
if (property) {
benchmark_hybrid("hybrid", n, v_input_1, v_input_2, v_output,
v_output_reference, stream, device, num_repeats,
num_warmups);
}
} }
CHECK_CUDA_ERROR(cuStreamDestroy(stream)); CHECK_CUDA_ERROR(cuStreamDestroy(stream));
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment