From bb6ef104b59cc6a609335d28128575b06fadbf3a Mon Sep 17 00:00:00 2001 From: Anna Wellmann <a.wellmann@tu-braunschweig.de> Date: Tue, 24 Aug 2021 10:53:31 +0200 Subject: [PATCH] Use streams also for exchange in x- and z-directions --- apps/gpu/LBM/MusselOyster/MusselOyster.cpp | 2 +- .../Calculation/UpdateGrid27.cpp | 10 ++- .../Communication/ExchangeData27.cpp | 79 ++++++++++++------- .../GPU/CudaMemoryManager.cpp | 78 ++++++++++++------ .../VirtualFluids_GPU/GPU/CudaMemoryManager.h | 10 +-- 5 files changed, 118 insertions(+), 61 deletions(-) diff --git a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp index c521944ad..d68668340 100644 --- a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp +++ b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp @@ -132,7 +132,7 @@ void multipleLevel(const std::string& configPath) *logging::out << logging::Logger::INFO_HIGH << "velocity real [m/s] = " << vxLB * para->getVelocityRatio()<< " \n"; *logging::out << logging::Logger::INFO_HIGH << "viscosity real [m^2/s] = " << viscosityLB * para->getViscosityRatio() << "\n"; - para->setTOut(11); + para->setTOut(10); para->setTEnd(10); para->setCalcDragLift(false); diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp index 62e57eeb0..b7de4658f 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp +++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp @@ -196,7 +196,9 @@ void collisionAdvectionDiffusion(Parameter* para, int level) void prepareExchangeMultiGPU(Parameter *para, int level, int streamIndex) { if (para->getNumprocs() > 1) { + prepareExchangePostCollDataXGPU27(para, level, streamIndex); prepareExchangePostCollDataYGPU27(para, level, streamIndex); + prepareExchangePostCollDataZGPU27(para, level, streamIndex); } } @@ -209,14 +211,16 @@ void exchangeMultiGPU(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryMa ////////////////////////////////////////////////////////////////////////// //3D domain decomposition - exchangePostCollDataXGPU27(para, comm, cudaManager, level); + exchangePostCollDataXGPU27(para, comm, cudaManager, level, streamIndex); exchangePostCollDataYGPU27(para, comm, cudaManager, level, streamIndex); - exchangePostCollDataZGPU27(para, comm, cudaManager, level); + exchangePostCollDataZGPU27(para, comm, cudaManager, level, streamIndex); ////////////////////////////////////////////////////////////////////////// //3D domain decomposition convection diffusion - if (para->getDiffOn()==true) + if (para->getDiffOn()) { + if (para->getUseStreams()) + std::cout << "Warning: Cuda streams not yet implemented for convection diffusion" << std::endl; exchangePostCollDataADXGPU27(para, comm, cudaManager, level); exchangePostCollDataADYGPU27(para, comm, cudaManager, level); exchangePostCollDataADZGPU27(para, comm, cudaManager, level); diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp index 77e8a4800..4bbc63378 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp @@ -88,13 +88,10 @@ // /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //} //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, - int streamIndex) +void prepareExchangePostCollDataXGPU27(Parameter *para, int level, int streamIndex) { - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host + cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager().getStream(streamIndex); for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - { GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], para->getParD(level)->sendProcessNeighborX[i].f[0], para->getParD(level)->sendProcessNeighborX[i].index, @@ -104,10 +101,19 @@ void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu para->getParD(level)->neighborZ_SP, para->getParD(level)->size_Mat_SP, para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborXFsDH(level, i); - } + para->getParD(level)->numberofthreads, + stream); +} + +void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, + int streamIndex) +{ + cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager().getStream(streamIndex); + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) + cudaManager->cudaCopyProcessNeighborXFsDH(level, i, streamIndex); + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //start non blocking MPI receive for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) @@ -131,6 +137,10 @@ void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu // comm->waitallGPU(); //} ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // wait for memcopy device to host to finish before sending data + if (para->getUseStreams()) + cudaStreamSynchronize(stream); + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //start blocking MPI send for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) { @@ -154,7 +164,7 @@ void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu //copy Host to Device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) { - cudaManager->cudaCopyProcessNeighborXFsHD(level, i); + cudaManager->cudaCopyProcessNeighborXFsHD(level, i, streamIndex); ////////////////////////////////////////////////////////////////////////// SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], para->getParD(level)->recvProcessNeighborX[i].f[0], @@ -165,7 +175,8 @@ void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu para->getParD(level)->neighborZ_SP, para->getParD(level)->size_Mat_SP, para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); + para->getParD(level)->numberofthreads, + stream); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } @@ -274,12 +285,17 @@ void prepareExchangePostCollDataYGPU27(Parameter *para, int level, int streamInd { cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager().getStream(streamIndex); for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], para->getParD(level)->sendProcessNeighborY[i].f[0], + GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], + para->getParD(level)->sendProcessNeighborY[i].f[0], para->getParD(level)->sendProcessNeighborY[i].index, para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, - para->getParD(level)->neighborX_SP, para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, para->getParD(level)->numberofthreads, stream); + para->getParD(level)->neighborX_SP, + para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, + para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, + para->getParD(level)->numberofthreads, + stream); } void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, @@ -447,13 +463,9 @@ void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu // /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //} //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void exchangePostCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, - int streamIndex) -{ - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - //copy Device to Host +void prepareExchangePostCollDataZGPU27(Parameter *para, int level, int streamIndex) { + cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager().getStream(streamIndex); for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - { GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], para->getParD(level)->sendProcessNeighborZ[i].f[0], para->getParD(level)->sendProcessNeighborZ[i].index, @@ -463,10 +475,18 @@ void exchangePostCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu para->getParD(level)->neighborZ_SP, para->getParD(level)->size_Mat_SP, para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); - ////////////////////////////////////////////////////////////////////////// - cudaManager->cudaCopyProcessNeighborZFsDH(level, i); - } + para->getParD(level)->numberofthreads, + stream); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +void exchangePostCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, + int streamIndex) +{ + cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager().getStream(streamIndex); + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //copy Device to Host + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) + cudaManager->cudaCopyProcessNeighborZFsDH(level, i, streamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //start non blocking MPI receive for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) @@ -490,6 +510,10 @@ void exchangePostCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu // comm->waitallGPU(); //} ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // wait for memcopy device to host to finish before sending data + if (para->getUseStreams()) + cudaStreamSynchronize(stream); + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //start blocking MPI send for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) { @@ -513,7 +537,7 @@ void exchangePostCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu //copy Host to Device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) { - cudaManager->cudaCopyProcessNeighborZFsHD(level, i); + cudaManager->cudaCopyProcessNeighborZFsHD(level, i, streamIndex); ////////////////////////////////////////////////////////////////////////// SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], para->getParD(level)->recvProcessNeighborZ[i].f[0], @@ -524,7 +548,8 @@ void exchangePostCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu para->getParD(level)->neighborZ_SP, para->getParD(level)->size_Mat_SP, para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads); + para->getParD(level)->numberofthreads, + stream); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp index 53c3221bc..72a17e863 100644 --- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp +++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp @@ -508,19 +508,33 @@ void CudaMemoryManager::cudaCopyProcessNeighborXIndex(int lev, unsigned int proc parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].memsizeIndex, cudaMemcpyHostToDevice)); } -void CudaMemoryManager::cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor) +void CudaMemoryManager::cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, int streamIndex) { - checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], - parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], - parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].memsizeFs, - cudaMemcpyHostToDevice)); + if (streamIndex == -1) + checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], + parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].memsizeFs, + cudaMemcpyHostToDevice)); + else + checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], + parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].memsizeFs, + cudaMemcpyHostToDevice, + parameter->getStreamManager().getStream(streamIndex))); } -void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor) +void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, int streamIndex) { - checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0], - parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], - parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].memsizeFs, - cudaMemcpyDeviceToHost)); + if (streamIndex == -1) + checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0], + parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].memsizeFs, + cudaMemcpyDeviceToHost)); + else + checkCudaErrors( cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0], + parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].memsizeFs, + cudaMemcpyDeviceToHost, + parameter->getStreamManager().getStream(streamIndex))); } void CudaMemoryManager::cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor) { @@ -567,9 +581,9 @@ void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int proce { if (streamIndex == -1) checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], - parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], - parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].memsizeFs, - cudaMemcpyHostToDevice)); + parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].memsizeFs, + cudaMemcpyHostToDevice)); else checkCudaErrors(cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], @@ -631,19 +645,33 @@ void CudaMemoryManager::cudaCopyProcessNeighborZIndex(int lev, unsigned int proc parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].memsizeIndex, cudaMemcpyHostToDevice)); } -void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor) +void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, int streamIndex) { - checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0], - parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], - parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].memsizeFs, - cudaMemcpyHostToDevice)); -} -void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor) -{ - checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0], - parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], - parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].memsizeFs, - cudaMemcpyDeviceToHost)); + if (streamIndex == -1) + checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0], + parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].memsizeFs, + cudaMemcpyHostToDevice)); + else + checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0], + parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].memsizeFs, + cudaMemcpyHostToDevice, + parameter->getStreamManager().getStream(streamIndex))); +} +void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, int streamIndex) +{ + if (streamIndex == -1) + checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0], + parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].memsizeFs, + cudaMemcpyDeviceToHost)); + else + checkCudaErrors( cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0], + parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], + parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].memsizeFs, + cudaMemcpyDeviceToHost, + parameter->getStreamManager().getStream(streamIndex))); } void CudaMemoryManager::cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor) { diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h index 1654e140d..e33b9c9ce 100644 --- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h +++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h @@ -89,20 +89,20 @@ public: ////////////////////////////////////////////////////////////////////////// //3D domain decomposition void cudaAllocProcessNeighborX(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor); + void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, int streamIndex); + void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, int streamIndex); void cudaCopyProcessNeighborXIndex(int lev, unsigned int processNeighbor); void cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor); // void cudaAllocProcessNeighborY(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, int streamIndexr); + void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, int streamIndex); void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, int streamIndex); void cudaCopyProcessNeighborYIndex(int lev, unsigned int processNeighbor); void cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor); // void cudaAllocProcessNeighborZ(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor); + void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, int streamIndex); + void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, int streamIndex); void cudaCopyProcessNeighborZIndex(int lev, unsigned int processNeighbor); void cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor); ////////////////////////////////////////////////////////////////////////// -- GitLab