diff --git a/main.cu b/main.cu
index 295df526aad7531d8f162ecc8775efda5439af77..42b05e94d3b0e20c60d89ebc2e01607ff841592c 100644
--- a/main.cu
+++ b/main.cu
@@ -421,6 +421,68 @@ void benchmark_allocmanaged(const char *name, int n, float v_input_1,
   CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_output)));
 }
 
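+// Hybrid allocation strategy: managed (unified) memory for the two input
+// buffers so they can be prefetched to the device, plain device memory for
+// the output, and pinned host memory for the asynchronous copy back.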
+void benchmark_hybrid(const char *name, int n, float v_input_1, float v_input_2,
+                      float v_output, float v_output_reference, CUstream stream,
+                      CUdevice device, int num_repeats, int num_warmups) {
+  float *d_input_1, *d_input_2, *d_output;
+  float *h_output;
+
+  CHECK_CUDA_ERROR(cuMemAllocManaged(
+      reinterpret_cast<CUdeviceptr *>(reinterpret_cast<void **>(&d_input_1)),
+      n * sizeof(float), CU_MEM_ATTACH_GLOBAL));
+  CHECK_CUDA_ERROR(cuMemAllocManaged(
+      reinterpret_cast<CUdeviceptr *>(reinterpret_cast<void **>(&d_input_2)),
+      n * sizeof(float), CU_MEM_ATTACH_GLOBAL));
+  CHECK_CUDA_ERROR(cuMemAlloc(
+      reinterpret_cast<CUdeviceptr *>(reinterpret_cast<void **>(&d_output)),
+      n * sizeof(float)));
+  CHECK_CUDA_ERROR(cuMemHostAlloc(reinterpret_cast<void **>(&h_output),
+                                  n * sizeof(float), 0));
+
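+  // Stage the managed input buffers on the device before the kernel runs.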
+  std::function<void()> function_input = [&]() {
+    CHECK_CUDA_ERROR(cuMemPrefetchAsync(
+        reinterpret_cast<CUdeviceptr>(reinterpret_cast<void *>(d_input_1)),
+        n * sizeof(float), device, stream));
+    CHECK_CUDA_ERROR(cuMemPrefetchAsync(
+        reinterpret_cast<CUdeviceptr>(reinterpret_cast<void *>(d_input_2)),
+        n * sizeof(float), device, stream));
+  };
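+  // Launch the element-wise float_addition kernel on the benchmark stream.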
+  std::function<void()> function_kernel = [&]() {
+    dim3 const threads_per_block{THREADS_PER_BLOCK};
+    dim3 const blocks_per_grid{BLOCKS_PER_GRID};
+    float_addition<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+        d_output, d_input_1, d_input_2, n);
+    CHECK_LAST_CUDA_ERROR();
+  };
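+  // Copy the device output into the pinned host buffer and synchronize.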
+  std::function<void()> function_output = [&]() {
+    CHECK_CUDA_ERROR(cuMemcpyDtoHAsync(h_output,
+                                       reinterpret_cast<CUdeviceptr>(d_output),
+                                       n * sizeof(float), stream));
+    CHECK_CUDA_ERROR(cuStreamSynchronize(stream));
+  };
+
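+  // Time the prefetch, kernel, and copy-back phases and check the result.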
+  run_benchmark(name, d_input_1, d_input_2, h_output, v_input_1, v_input_2,
+                v_output, d_output, v_output_reference, n, function_input,
+                function_kernel, function_output, stream, num_repeats,
+                num_warmups);
+
+  report_device_memory(device);
+  std::cout << std::endl;
+
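+  // Release the managed, device, and pinned host allocations.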
+  CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_input_1)));
+  CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_input_2)));
+  CHECK_CUDA_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(d_output)));
+  CHECK_CUDA_ERROR(cuMemFreeHost(h_output));
+}
+
 int main() {
   constexpr int const num_repeats{3};
   constexpr int const num_warmups{1};
@@ -512,6 +574,14 @@ int main() {
                                prefetch ? 0 : num_warmups, flags, prefetch);
       }
     }
+
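+    // The hybrid benchmark relies on managed-memory prefetching, so only run
+    // it when `property` indicates device support.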
+    if (property) {
+      benchmark_hybrid("hybrid", n, v_input_1, v_input_2, v_output,
+                       v_output_reference, stream, device, num_repeats,
+                       num_warmups);
+    }
   }
 
   CHECK_CUDA_ERROR(cuStreamDestroy(stream));