Skip to content
Snippets Groups Projects
Commit 2595a0b2 authored by Mattia Mancini's avatar Mattia Mancini
Browse files

Add stream capture

parent 2a14f6a1
No related branches found
No related tags found
No related merge requests found
......@@ -18,16 +18,30 @@ __global__ void debugPrint(float* data, int size) {
}
}
int main(int argc, char* argv[]) {
constexpr int data_size = 100;
std::array<float, data_size> data_in;
std::array<float, data_size> data_out;
// Fills the first `size` floats of the host buffer `data` with `value`.
//
//   data  - pointer to the first element to overwrite
//   size  - number of elements to write; 0 leaves the buffer untouched
//   value - the value stored into every element
void init_input(float* data, size_t size, float value) {
  float* const stop = data + size;
  // Walk the buffer by pointer rather than by index.
  for (float* cursor = data; cursor != stop; ++cursor) {
    *cursor = value;
  }
}
for (size_t idx = 0; idx < data_size; idx++) {
data_in[idx] = 3;
data_out[idx] = -1;
// Prints every element of a host float buffer to std::cout, one per line,
// in the form `name[i] = value`.
//
//   data - pointer to the first element to print
//   size - number of elements to print; 0 prints nothing
//   name - label written in front of each index
void print_array(float* data, size_t size, const std::string& name) {
  // The index has the same unsigned type as `size`: an `int` counter would be
  // a mixed-sign comparison and would overflow for sizes above INT_MAX.
  for (size_t i = 0; i < size; i++) {
    // std::endl is kept so each line is flushed as soon as it is written.
    std::cout << name << "[" << i << "] = " << data[i] << std::endl;
  }
}
int main(int argc, char* argv[]) {
constexpr int data_size = 5;
std::array<float, data_size> data_in;
std::array<float, data_size> data_out;
dim3 gridDim{15, 1, 1};
dim3 blockDim{1, 1, 1};
init_input(data_in.data(), data_size, 3);
init_input(data_out.data(), data_size, 1);
std::cout << "Input data" << std::endl;
print_array(data_in.data(), data_in.size(), "data_in");
print_array(data_out.data(), data_out.size(), "data_out");
cudaGraph_t graph;
cudaGraphExec_t graph_exec;
cudaStream_t stream;
......@@ -106,6 +120,70 @@ int main(int argc, char* argv[]) {
checkCudaErrors(cudaGraphLaunch(graph_exec, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
checkCudaErrors(cudaGraphExecDestroy(graph_exec));
cudaGraphDebugDotPrint(graph, "graph.dot",
cudaGraphDebugDotFlags::cudaGraphDebugDotFlagsVerbose);
std::cout << "Final result" << std::endl;
for (size_t idx = 0; idx < data_out.size(); idx++) {
std::cout << "data_in[" << idx << "] = " << data_in[idx] << " data_out["
<< idx << "] = " << data_out[idx] << std::endl;
}
/*
Use stream capture to create the graph, as suggested on the NVIDIA forum.
*/
init_input(data_in.data(), data_in.size(), 3);
init_input(data_out.data(), data_out.size(), 1);
cudaGraph_t graph_cptr;
cudaGraphExec_t graph_exec_cptr;
std::cout << "Now using stream capture to create the graph" << std::endl;
std::cout << "Input data" << std::endl;
print_array(data_in.data(), data_in.size(), "data_in");
print_array(data_out.data(), data_out.size(), "data_out");
void* array_d;
checkCudaErrors(cudaStreamBeginCapture(
stream, cudaStreamCaptureMode::cudaStreamCaptureModeRelaxed));
checkCudaErrors(
cudaMallocAsync(&array_d, sizeof(float) * data_in.size(), stream));
cudaHostFn_t host_func = set_value;
checkCudaErrors(cudaLaunchHostFunc(stream, host_func, data_in.data()));
cudaMemcpy3DParms to_device_pars_cptr = {0};
to_device_pars.dstPos = make_cudaPos(0, 0, 0);
to_device_pars.dstPtr =
make_cudaPitchedPtr(array_d, array_size * sizeof(float), array_size, 1);
to_device_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
to_device_pars.kind = cudaMemcpyHostToDevice;
to_device_pars.srcPos = make_cudaPos(0, 0, 0);
to_device_pars.srcPtr = make_cudaPitchedPtr(
data_in.data(), array_size * sizeof(float), array_size, 1);
checkCudaErrors(cudaMemcpy3DAsync(&to_device_pars_cptr, stream));
debugPrint<<<gridDim, blockDim, 0, stream>>>((float*)array_d, array_size);
cudaMemcpy3DParms to_host_pars_cptr = {0};
to_host_pars.dstPos = make_cudaPos(0, 0, 0);
to_host_pars.dstPtr = make_cudaPitchedPtr(
data_out.data(), array_size * sizeof(float), array_size, 1);
to_host_pars.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
to_host_pars.kind = cudaMemcpyDeviceToHost;
to_host_pars.srcPos = make_cudaPos(0, 0, 0);
to_host_pars.srcPtr =
make_cudaPitchedPtr(array_d, array_size * sizeof(float), array_size, 1);
checkCudaErrors(cudaMemcpy3DAsync(&to_host_pars_cptr, stream));
checkCudaErrors(cudaFreeAsync(array_d, stream));
checkCudaErrors(cudaStreamEndCapture(stream, &graph_cptr));
cudaGraphDebugDotPrint(graph_cptr, "graph_cptr.dot",
cudaGraphDebugDotFlags::cudaGraphDebugDotFlagsVerbose);
checkCudaErrors(cudaGraphInstantiate(&graph_exec_cptr, graph, NULL, NULL, 0));
checkCudaErrors(cudaGraphLaunch(graph_exec_cptr, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
std::cout << "Final result" << std::endl;
for (size_t idx = 0; idx < data_out.size(); idx++) {
std::cout << "data_in[" << idx << "] = " << data_in[idx] << " data_out["
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment