diff --git a/main.cu b/main.cu
index 919448fb06be02280f1f1817108be7f0b929e544..f9b673f02314e10bee0e2fddccfebd966b5390d7 100644
--- a/main.cu
+++ b/main.cu
@@ -18,16 +18,36 @@ __global__ void debugPrint(float* data, int size) {
   }
 }
 
-int main(int argc, char* argv[]) {
-  constexpr int data_size = 100;
-  std::array<float, data_size> data_in;
-  std::array<float, data_size> data_out;
+// Fills the first `size` elements of the host array `data` with `value`.
+void init_input(float* data, size_t size, float value) {
+  for (size_t i = 0; i < size; i++) {
+    data[i] = value;
+  }
+}
 
-  for (size_t idx = 0; idx < data_size; idx++) {
-    data_in[idx] = 3;
-    data_out[idx] = -1;
+// Prints the first `size` elements of `data` to stdout, one per line, in the
+// form "<name>[<index>] = <value>".
+void print_array(float* data, size_t size, const std::string& name) {
+  // size_t index matches the type of `size`; avoids a signed/unsigned compare.
+  for (size_t i = 0; i < size; i++) {
+    std::cout << name << "[" << i << "] = " << data[i] << std::endl;
   }
+}
 
+int main(int argc, char* argv[]) {
+  constexpr int data_size = 5;
+  std::array<float, data_size> data_in;
+  std::array<float, data_size> data_out;
+  // Launch configuration passed to debugPrint in the stream-capture section.
+  dim3 gridDim{15, 1, 1};
+  dim3 blockDim{1, 1, 1};
+
+  // Fill both host buffers with known values and print them.
+  init_input(data_in.data(), data_size, 3.0f);
+  init_input(data_out.data(), data_size, 1.0f);
+  std::cout << "Input data" << std::endl;
+  print_array(data_in.data(), data_in.size(), "data_in");
+  print_array(data_out.data(), data_out.size(), "data_out");
   cudaGraph_t graph;
   cudaGraphExec_t graph_exec;
   cudaStream_t stream;
@@ -106,6 +126,87 @@ int main(int argc, char* argv[]) {
   checkCudaErrors(cudaGraphLaunch(graph_exec, stream));
   checkCudaErrors(cudaStreamSynchronize(stream));
   checkCudaErrors(cudaGraphExecDestroy(graph_exec));
+  // Dump the first graph (`graph`) to graph.dot for inspection.
+  checkCudaErrors(cudaGraphDebugDotPrint(
+      graph, "graph.dot",
+      cudaGraphDebugDotFlags::cudaGraphDebugDotFlagsVerbose));
+  std::cout << "Final result" << std::endl;
+  for (size_t idx = 0; idx < data_out.size(); idx++) {
+    std::cout << "data_in[" << idx << "] = " << data_in[idx] << " data_out["
+              << idx << "] = " << data_out[idx] << std::endl;
+  }
+
+  /*
+  Use stream capture to create the graph, as suggested on the NVIDIA forum.
+  */
+  init_input(data_in.data(), data_in.size(), 3.0f);
+  init_input(data_out.data(), data_out.size(), 1.0f);
+
+  cudaGraph_t graph_cptr;
+  cudaGraphExec_t graph_exec_cptr;
+  std::cout << "Now using stream capture to create the graph" << std::endl;
+
+  std::cout << "Input data" << std::endl;
+  print_array(data_in.data(), data_in.size(), "data_in");
+  print_array(data_out.data(), data_out.size(), "data_out");
+  void* array_d;
+  // Work enqueued on `stream` between Begin/EndCapture is recorded into
+  // graph_cptr; it only runs when the instantiated graph is launched below.
+  checkCudaErrors(cudaStreamBeginCapture(
+      stream, cudaStreamCaptureMode::cudaStreamCaptureModeRelaxed));
+
+  checkCudaErrors(
+      cudaMallocAsync(&array_d, sizeof(float) * data_in.size(), stream));
+
+  cudaHostFn_t host_func = set_value;
+  checkCudaErrors(cudaLaunchHostFunc(stream, host_func, data_in.data()));
+
+  // Host -> device copy of data_in. The fields are set on the *_cptr struct
+  // because that is the struct handed to cudaMemcpy3DAsync below.
+  cudaMemcpy3DParms to_device_pars_cptr = {0};
+  to_device_pars_cptr.dstPos = make_cudaPos(0, 0, 0);
+  to_device_pars_cptr.dstPtr =
+      make_cudaPitchedPtr(array_d, array_size * sizeof(float), array_size, 1);
+  to_device_pars_cptr.extent =
+      make_cudaExtent(sizeof(float) * array_size, 1, 1);
+  to_device_pars_cptr.kind = cudaMemcpyHostToDevice;
+  to_device_pars_cptr.srcPos = make_cudaPos(0, 0, 0);
+  to_device_pars_cptr.srcPtr = make_cudaPitchedPtr(
+      data_in.data(), array_size * sizeof(float), array_size, 1);
+
+  checkCudaErrors(cudaMemcpy3DAsync(&to_device_pars_cptr, stream));
+
+  debugPrint<<<gridDim, blockDim, 0, stream>>>((float*)array_d, array_size);
+  // A kernel launch returns no status; query the launch error explicitly.
+  checkCudaErrors(cudaGetLastError());
+
+  // Device -> host copy into data_out, configured on the *_cptr struct.
+  cudaMemcpy3DParms to_host_pars_cptr = {0};
+  to_host_pars_cptr.dstPos = make_cudaPos(0, 0, 0);
+  to_host_pars_cptr.dstPtr = make_cudaPitchedPtr(
+      data_out.data(), array_size * sizeof(float), array_size, 1);
+  to_host_pars_cptr.extent = make_cudaExtent(sizeof(float) * array_size, 1, 1);
+  to_host_pars_cptr.kind = cudaMemcpyDeviceToHost;
+  to_host_pars_cptr.srcPos = make_cudaPos(0, 0, 0);
+  to_host_pars_cptr.srcPtr =
+      make_cudaPitchedPtr(array_d, array_size * sizeof(float), array_size, 1);
+
+  checkCudaErrors(cudaMemcpy3DAsync(&to_host_pars_cptr, stream));
+  checkCudaErrors(cudaFreeAsync(array_d, stream));
+
+  checkCudaErrors(cudaStreamEndCapture(stream, &graph_cptr));
+
+  checkCudaErrors(cudaGraphDebugDotPrint(
+      graph_cptr, "graph_cptr.dot",
+      cudaGraphDebugDotFlags::cudaGraphDebugDotFlagsVerbose));
+  // Instantiate and launch the captured graph (graph_cptr), not `graph`.
+  checkCudaErrors(
+      cudaGraphInstantiate(&graph_exec_cptr, graph_cptr, NULL, NULL, 0));
+  checkCudaErrors(cudaGraphLaunch(graph_exec_cptr, stream));
+  checkCudaErrors(cudaStreamSynchronize(stream));
+  // The launch has completed; release the capture-based graph objects.
+  checkCudaErrors(cudaGraphExecDestroy(graph_exec_cptr));
+  checkCudaErrors(cudaGraphDestroy(graph_cptr));
   std::cout << "Final result" << std::endl;
   for (size_t idx = 0; idx < data_out.size(); idx++) {
     std::cout << "data_in[" << idx << "] = " << data_in[idx] << " data_out["