diff --git a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp index dc7ac1f193b9a6e4475f575ea945add642c52b21..783ea86dfbd066b8a6105621f003b051480754c8 100644 --- a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp +++ b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp @@ -132,8 +132,8 @@ void multipleLevel(const std::string& configPath) *logging::out << logging::Logger::INFO_HIGH << "velocity real [m/s] = " << vxLB * para->getVelocityRatio()<< " \n"; *logging::out << logging::Logger::INFO_HIGH << "viscosity real [m^2/s] = " << viscosityLB * para->getViscosityRatio() << "\n"; - para->setTOut(100); - para->setTEnd(100); + para->setTOut(11); + para->setTEnd(10); para->setCalcDragLift(false); para->setUseWale(false); diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp index d4d0ccdbeae290b7a29a4e27fbcdd3af7273a87f..e05186fa7750f672efbc0ed7f1991ace14104a66 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp +++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp @@ -26,23 +26,31 @@ void updateGrid27(Parameter* para, ////////////////////////////////////////////////////////////////////////// - if (para->getUseStreams()) + if (para->getUseStreams()) { + // launch border kernel collisionUsingIndex(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndicesBorder, para->getParD(level)->numberOffluidNodesBorder, 1); - else + para->getStreamManager().createCudaEvents(); + } else collision(para, pm, level, t, kernels); ////////////////////////////////////////////////////////////////////////// - if (para->getUseStreams()) - exchangeMultiGPU(para, comm, cudaManager, level, 1); - else - exchangeMultiGPU(para, comm, cudaManager, level, -1); + if (para->getUseStreams()) { + prepareExchangeMultiGPU(para, level, 1); - if (para->getUseStreams()) + // launch bulk kernel + para->getStreamManager().waitOnStartBulkKernelEvent(0); collisionUsingIndex(para, pm, level, t, kernels, para->getParD(level)->fluidNodeIndices, para->getParD(level)->numberOfFluidNodes, 0); + exchangeMultiGPU(para, comm, cudaManager, level, 1); + } else { + prepareExchangeMultiGPU(para, level, -1); + exchangeMultiGPU(para, comm, cudaManager, level, -1); + } + + ////////////////////////////////////////////////////////////////////////// postCollisionBC(para, level, t); @@ -66,10 +74,14 @@ void updateGrid27(Parameter* para, { fineToCoarse(para, level); + prepareExchangeMultiGPU(para, level, -1); exchangeMultiGPU(para, comm, cudaManager, level, -1); coarseToFine(para, level); - } + } + + if (para->getUseStreams()) + para->getStreamManager().destroyCudaEvents(); } void collision(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level, unsigned int t, std::vector < SPtr< Kernel>>& kernels) @@ -181,6 +193,13 @@ void collisionAdvectionDiffusion(Parameter* para, int level) } } +void prepareExchangeMultiGPU(Parameter *para, int level, int streamIndex) +{ + if (para->getNumprocs() > 1) { + prepareExchangePostCollDataYGPU27(para, level, streamIndex); + } +} + void exchangeMultiGPU(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex) { diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h index 8e9fbff0414cf5e211957822089f8b2274e2cba0..39a985435e04c3e406b5293462ee3ae60bb91486 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h +++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h @@ -26,6 +26,8 @@ extern "C" void collisionPorousMedia(Parameter* para, std::vector<std::shared_pt extern "C" void collisionAdvectionDiffusion(Parameter* para, int level); +extern "C" void prepareExchangeMultiGPU(Parameter *para, int level, int streamIndex); + extern "C" void exchangeMultiGPU(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex); diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp index 6009bf9f57f3f7b7885c3d716533759ea72f63cb..bbc82854c71362d09a28b4d8b8f553d81666b3f8 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp @@ -198,6 +198,10 @@ void exchangePreCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, Cud //////////////////////////////////////////////////////////////////////////); cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex); } + + //if (para->getUseStreams() && startBulkKernel!=nullptr) + // cudaEventRecord(*startBulkKernel, stream); + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //start non blocking MPI receive for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) @@ -264,7 +268,21 @@ void exchangePreCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, Cud } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +void prepareExchangePostCollDataYGPU27(Parameter *para, int level, int streamIndex) +{ + cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager().getStream(streamIndex); + for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) + GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], para->getParD(level)->sendProcessNeighborY[i].f[0], + para->getParD(level)->sendProcessNeighborY[i].index, + para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, + para->getParD(level)->neighborX_SP, para->getParD(level)->neighborY_SP, + para->getParD(level)->neighborZ_SP, para->getParD(level)->size_Mat_SP, + para->getParD(level)->evenOrOdd, para->getParD(level)->numberofthreads, stream); + if (para->getUseStreams()) + para->getStreamManager().triggerStartBulkKernel(streamIndex); +} + void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex) { @@ -272,21 +290,8 @@ void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, Cu /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //copy Device to Host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - { - GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], - para->getParD(level)->sendProcessNeighborY[i].f[0], - para->getParD(level)->sendProcessNeighborY[i].index, - para->getParD(level)->sendProcessNeighborY[i].numberOfNodes, - para->getParD(level)->neighborX_SP, - para->getParD(level)->neighborY_SP, - para->getParD(level)->neighborZ_SP, - para->getParD(level)->size_Mat_SP, - para->getParD(level)->evenOrOdd, - para->getParD(level)->numberofthreads, - stream); - ////////////////////////////////////////////////////////////////////////// cudaManager->cudaCopyProcessNeighborYFsDH(level, i, streamIndex); - } + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //start non blocking MPI receive for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h index bf4cddf7462167ce28bb16ad07761ee142fb1e7a..b1da42cd9371c16c7168bbe471282612af04fe95 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h @@ -18,6 +18,7 @@ extern "C" void exchangePreCollDataYGPU27(Parameter *para, vf::gpu::Communicator int level, int streamIndex = -1); extern "C" void exchangePreCollDataZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); extern "C" void exchangePostCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level); +extern "C" void prepareExchangePostCollDataYGPU27(Parameter *para, int level, int streamIndex = -1); extern "C" void exchangePostCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex = -1); extern "C" void exchangePostCollDataZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp index 437e36299e6dec0f39141708152af8f1b17d0532..b702430e0efe4a095376de04b8dfb000e9f0403a 100644 --- a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp +++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp @@ -28,6 +28,7 @@ // //======================================================================================= #include "CudaStreamManager.h" +#include <helper_cuda.h> CudaStreamManager::CudaStreamManager() {} @@ -48,5 +49,22 @@ void CudaStreamManager::terminateStreams() cudaStream_t &CudaStreamManager::getStream(uint streamIndex) { - return cudaStreams[streamIndex]; -} \ No newline at end of file + return cudaStreams[streamIndex]; } + +void CudaStreamManager::createCudaEvents() +{ + checkCudaErrors(cudaEventCreateWithFlags(&startBulkKernel, cudaEventDisableTiming)); +} + +void CudaStreamManager::destroyCudaEvents() {checkCudaErrors(cudaEventDestroy(startBulkKernel)); +} + +void CudaStreamManager::triggerStartBulkKernel(int streamIndex) +{ + checkCudaErrors(cudaEventRecord(startBulkKernel, cudaStreams[streamIndex])); +} + +void CudaStreamManager::waitOnStartBulkKernelEvent(int streamIndex) +{ + checkCudaErrors(cudaStreamWaitEvent(cudaStreams[streamIndex], startBulkKernel)); +} diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h index 3912a383da77619cb742639b047735f0347fced2..8b16b9f3d269970ebe54f8f9e6ec42e6f231636e 100644 --- a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h +++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h @@ -40,6 +40,7 @@ class CudaStreamManager { private: std::vector<cudaStream_t> cudaStreams; + cudaEvent_t startBulkKernel; public: CudaStreamManager(); @@ -47,6 +48,11 @@ public: void launchStreams(uint numberOfStreams); void terminateStreams(); cudaStream_t &getStream(uint streamIndex); + + void createCudaEvents(); + void destroyCudaEvents(); + void triggerStartBulkKernel(int streamIndex); + void waitOnStartBulkKernelEvent(int streamIndex); }; -#endif \ No newline at end of file +#endif