diff --git a/main.cu b/main.cu index 295df526aad7531d8f162ecc8775efda5439af77..42b05e94d3b0e20c60d89ebc2e01607ff841592c 100644 --- a/main.cu +++ b/main.cu @@ -421,6 +421,60 @@ void benchmark_allocmanaged(const char *name, int n, float v_input_1, CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_output))); } +void benchmark_hybrid(const char *name, int n, float v_input_1, float v_input_2, + float v_output, float v_output_reference, CUstream stream, + CUdevice device, int num_repeats, int num_warmups) { + float *d_input_1, *d_input_2, *d_output; + float *h_output; + + CHECK_CUDA_ERROR(cuMemAllocManaged( + reinterpret_cast<CUdeviceptr *>(reinterpret_cast<void **>(&d_input_1)), + n * sizeof(float), CU_MEM_ATTACH_GLOBAL)); + CHECK_CUDA_ERROR(cuMemAllocManaged( + reinterpret_cast<CUdeviceptr *>(reinterpret_cast<void **>(&d_input_2)), + n * sizeof(float), CU_MEM_ATTACH_GLOBAL)); + CHECK_CUDA_ERROR(cuMemAlloc( + reinterpret_cast<CUdeviceptr *>(reinterpret_cast<void **>(&d_output)), + n * sizeof(float))); + CHECK_CUDA_ERROR(cuMemHostAlloc(reinterpret_cast<void **>(&h_output), + n * sizeof(float), 0)); + + std::function<void()> function_input = [&]() { + CHECK_CUDA_ERROR(cuMemPrefetchAsync( + reinterpret_cast<CUdeviceptr>(reinterpret_cast<void *>(d_input_1)), + n * sizeof(float), device, stream)); + CHECK_CUDA_ERROR(cuMemPrefetchAsync( + reinterpret_cast<CUdeviceptr>(reinterpret_cast<void *>(d_input_2)), + n * sizeof(float), device, stream)); + }; + std::function<void()> function_kernel = [&]() { + dim3 const threads_per_block{THREADS_PER_BLOCk}; + dim3 const blocks_per_grid{BLOCKS_PER_GRID}; + float_addition<<<blocks_per_grid, threads_per_block, 0, stream>>>( + d_output, d_input_1, d_input_2, n); + CHECK_LAST_CUDA_ERROR(); + }; + std::function<void()> function_output = [&]() { + CHECK_CUDA_ERROR(cuMemcpyDtoHAsync(h_output, + reinterpret_cast<CUdeviceptr>(d_output), + n * sizeof(float), stream)); + CHECK_CUDA_ERROR(cuStreamSynchronize(stream)); + }; + + run_benchmark(name, d_input_1, d_input_2, h_output, v_input_1, v_input_2, + v_output, d_output, v_output_reference, n, function_input, + function_kernel, function_output, stream, num_repeats, + num_warmups); + + report_device_memory(device); + std::cout << std::endl; + + CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_input_1))); + CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_input_2))); + CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_output))); + CHECK_CUDA_ERROR(cuMemFreeHost(h_output)); +} + int main() { constexpr int const num_repeats{3}; constexpr int const num_warmups{1}; @@ -512,6 +566,12 @@ int main() { prefetch ? 0 : num_warmups, flags, prefetch); } } + + if (property) { + benchmark_hybrid("hybrid", n, v_input_1, v_input_2, v_output, + v_output_reference, stream, device, num_repeats, + num_warmups); + } } CHECK_CUDA_ERROR(cuStreamDestroy(stream));