diff --git a/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp b/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp index 4a14d19c10936f84379f332ef24f081f0ebb0cb7..6bc2aff337f7531df8e8f495d3b1dd171c81d792 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp +++ b/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp @@ -40,7 +40,7 @@ void CollisionAndExchange_noStreams_indexKernel::operator()(UpdateGrid27 *update //! 1. run collision //! updateGrid->collisionUsingIndices(level, t, para->getParD(level)->fluidNodeIndices, - para->getParD(level)->numberOfFluidNodes, -1); + para->getParD(level)->numberOfFluidNodes); //! 2. exchange information between GPUs updateGrid->exchangeMultiGPU_noStreams_withPrepare(level, false); @@ -61,28 +61,25 @@ void CollisionAndExchange_noStreams_oldKernel::operator()(UpdateGrid27 *updateGr void CollisionAndExchange_streams::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level, unsigned int t) { - int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex(); - int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex(); - //! \details steps: //! //! 1. run collision for nodes which are at the border of the gpus/processes //! updateGrid->collisionUsingIndices(level, t, para->getParD(level)->fluidNodeIndicesBorder, - para->getParD(level)->numberOfFluidNodesBorder, borderStreamIndex); + para->getParD(level)->numberOfFluidNodesBorder, CudaStreamIndex::Border); //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished //! - updateGrid->prepareExchangeMultiGPU(level, borderStreamIndex); + updateGrid->prepareExchangeMultiGPU(level, CudaStreamIndex::Border); if (para->getUseStreams()) - para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex); + para->getStreamManager()->triggerStartBulkKernel(CudaStreamIndex::Border); //! 3. launch the collision kernel for bulk nodes //! - para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex); + para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamIndex::Bulk); updateGrid->collisionUsingIndices(level, t, para->getParD(level)->fluidNodeIndices, - para->getParD(level)->numberOfFluidNodes, bulkStreamIndex); + para->getParD(level)->numberOfFluidNodes, CudaStreamIndex::Bulk); //! 4. exchange information between GPUs - updateGrid->exchangeMultiGPU(level, borderStreamIndex); + updateGrid->exchangeMultiGPU(level, CudaStreamIndex::Border); } diff --git a/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp b/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp index cd74216e1fbe7b718c72046ace4b7d2e7cf451fe..fe7da04d4074a37321f4340f1d76f0606963b89e 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp +++ b/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp @@ -38,67 +38,62 @@ void NoRefinement::operator()(UpdateGrid27 *updateGrid, Parameter *para, int lev void RefinementAndExchange_streams_exchangeInterface::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level) { - int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex(); - int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex(); - //! \details steps: //! //! 1. Interpolation fine to coarse for nodes which are at the border of the gpus/processes //! 
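// For context on collisionUsingIndices() above: a minimal sketch (assumed, not part of this diff) of how a
// collision kernel can run on an explicit list of fluid nodes, so that border and bulk nodes are processed by
// separate launches on separate CUDA streams. The kernel name and the collision update are placeholders; the
// actual launches live in the runOnIndices() implementations further down in this diff.
__global__ void collideOnIndices(const unsigned int *indices, unsigned int numberOfIndices)
{
    const unsigned int k = blockIdx.x * blockDim.x + threadIdx.x;
    if (k >= numberOfIndices)
        return;
    const unsigned int nodeIndex = indices[k]; // only the listed nodes are updated
    // ... collision update for nodeIndex ...
}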
- updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBorder, para->getParD(level)->offFC, borderStreamIndex); + updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBorder, para->getParD(level)->offFC, CudaStreamIndex::Border); //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished //! - updateGrid->prepareExchangeMultiGPUAfterFtoC(level, borderStreamIndex); + updateGrid->prepareExchangeMultiGPUAfterFtoC(level, CudaStreamIndex::Border); if (para->getUseStreams()) - para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex); + para->getStreamManager()->triggerStartBulkKernel(CudaStreamIndex::Border); //! 3. launch the bulk kernels for both interpolation processes (fine to coarse and coarse to fine) //! - para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex); - updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBulk, para->getParD(level)->offFCBulk, bulkStreamIndex); - updateGrid->coarseToFine(level, &para->getParD(level)->intCFBulk, para->getParD(level)->offCFBulk, bulkStreamIndex); + para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamIndex::Bulk); + updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBulk, para->getParD(level)->offFCBulk, CudaStreamIndex::Bulk); + updateGrid->coarseToFine(level, &para->getParD(level)->intCFBulk, para->getParD(level)->offCFBulk, CudaStreamIndex::Bulk); //! 4. exchange information between GPUs (only nodes which are part of the interpolation) //! - updateGrid->exchangeMultiGPUAfterFtoC(level, borderStreamIndex); + updateGrid->exchangeMultiGPUAfterFtoC(level, CudaStreamIndex::Border); //! 5. interpolation coarse to fine for nodes which are at the border of the gpus/processes //! - updateGrid->coarseToFine(level, &para->getParD(level)->intCFBorder, para->getParD(level)->offCF, borderStreamIndex); + updateGrid->coarseToFine(level, &para->getParD(level)->intCFBorder, para->getParD(level)->offCF, CudaStreamIndex::Border); cudaDeviceSynchronize(); } -void RefinementAndExchange_streams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level){ - int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex(); - int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex(); - +void RefinementAndExchange_streams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level) +{ //! \details steps: //! //! 1. interpolation fine to coarse for nodes which are at the border of the gpus/processes //! - updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBorder, para->getParD(level)->offFC, borderStreamIndex); + updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBorder, para->getParD(level)->offFC, CudaStreamIndex::Border); //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished //! - updateGrid->prepareExchangeMultiGPU(level, borderStreamIndex); + updateGrid->prepareExchangeMultiGPU(level, CudaStreamIndex::Border); if (para->getUseStreams()) - para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex); + para->getStreamManager()->triggerStartBulkKernel(CudaStreamIndex::Border); //! 3. launch the bulk kernels for both interpolation processes (fine to coarse and coarse to fine) //! 
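// A sketch of the event-based handoff that triggerStartBulkKernel() and waitOnStartBulkKernelEvent()
// presumably implement (assumed; this diff does not show CudaStreamManager's internals): the border stream
// records an event once the border work is enqueued, the bulk stream waits on that event, and the bulk
// interpolation below then overlaps with the communication running on the border stream.
#include <cuda_runtime.h>

void triggerStartBulkKernelSketch(cudaEvent_t startBulkKernelEvent, cudaStream_t borderStream)
{
    cudaEventRecord(startBulkKernelEvent, borderStream); // enqueued after the border kernels
}

void waitOnStartBulkKernelEventSketch(cudaEvent_t startBulkKernelEvent, cudaStream_t bulkStream)
{
    cudaStreamWaitEvent(bulkStream, startBulkKernelEvent, 0); // bulk stream waits; the host does not block
}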
- para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex); - updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBulk, para->getParD(level)->offFCBulk, bulkStreamIndex); - updateGrid->coarseToFine(level, &para->getParD(level)->intCFBulk, para->getParD(level)->offCFBulk, bulkStreamIndex); + para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamIndex::Bulk); + updateGrid->fineToCoarse(level, &para->getParD(level)->intFCBulk, para->getParD(level)->offFCBulk, CudaStreamIndex::Bulk); + updateGrid->coarseToFine(level, &para->getParD(level)->intCFBulk, para->getParD(level)->offCFBulk, CudaStreamIndex::Bulk); //! 4. exchange information between GPUs (all nodes) //! - updateGrid->exchangeMultiGPU(level, borderStreamIndex); + updateGrid->exchangeMultiGPU(level, CudaStreamIndex::Border); //! 5. interpolation coarse to fine for nodes which are at the border of the gpus/processes //! - updateGrid->coarseToFine(level, &para->getParD(level)->intCFBorder, para->getParD(level)->offCF, borderStreamIndex); + updateGrid->coarseToFine(level, &para->getParD(level)->intCFBorder, para->getParD(level)->offCF, CudaStreamIndex::Border); cudaDeviceSynchronize(); } @@ -109,14 +104,14 @@ void RefinementAndExchange_noStreams_exchangeInterface::operator()(UpdateGrid27 //! //! 1. interpolation fine to coarse //! - updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, -1); + updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, CudaStreamIndex::Legacy); //! 2. exchange information between GPUs (only nodes which are part of the interpolation) //! updateGrid->exchangeMultiGPU_noStreams_withPrepare(level, true); //! 3. interpolation coarse to fine - updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, -1); + updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, CudaStreamIndex::Legacy); } void RefinementAndExchange_noStreams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level) @@ -125,14 +120,14 @@ void RefinementAndExchange_noStreams_exchangeAllNodes::operator()(UpdateGrid27 * //! //! 1. interpolation fine to coarse //! - updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, -1); + updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, CudaStreamIndex::Legacy); //! 2. exchange information between GPUs (all nodes) //! updateGrid->exchangeMultiGPU_noStreams_withPrepare(level, false); //! 3. interpolation coarse to fine - updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, -1); + updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, CudaStreamIndex::Legacy); } void Refinement_noExchange::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level) @@ -141,7 +136,7 @@ void Refinement_noExchange::operator()(UpdateGrid27 *updateGrid, Parameter *para //! //! 1. interpolation fine to coarse //! - updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, -1); + updateGrid->fineToCoarse(level, &para->getParD(level)->intFC, para->getParD(level)->offFC, CudaStreamIndex::Legacy); //! 2. 
interpolation coarse to fine - updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, -1); + updateGrid->coarseToFine(level, &para->getParD(level)->intCF, para->getParD(level)->offCF, CudaStreamIndex::Legacy); } diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp index 296ab819c5538a6b6d6a6827b5c28cbc475af838..a68ee652ebea282ce90b8774bc20bb9a73f59add 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp +++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp @@ -71,7 +71,7 @@ void UpdateGrid27::collisionAllNodes(int level, unsigned int t) collisionAdvectionDiffusion(level); } -void UpdateGrid27::collisionUsingIndices(int level, unsigned int t, uint *fluidNodeIndices, uint numberOfFluidNodes, int stream) +void UpdateGrid27::collisionUsingIndices(int level, unsigned int t, uint *fluidNodeIndices, uint numberOfFluidNodes, CudaStreamIndex stream) { if (fluidNodeIndices != nullptr && numberOfFluidNodes != 0) kernels.at(level)->runOnIndices(fluidNodeIndices, numberOfFluidNodes, stream); @@ -118,21 +118,21 @@ void UpdateGrid27::collisionAdvectionDiffusion(int level) this->adKernelManager->runADcollisionKernel(level); } -void UpdateGrid27::prepareExchangeMultiGPU(int level, int streamIndex) +void UpdateGrid27::prepareExchangeMultiGPU(int level, CudaStreamIndex streamIndex) { prepareExchangeCollDataXGPU27AllNodes(para.get(), level, streamIndex); prepareExchangeCollDataYGPU27AllNodes(para.get(), level, streamIndex); prepareExchangeCollDataZGPU27AllNodes(para.get(), level, streamIndex); } -void UpdateGrid27::prepareExchangeMultiGPUAfterFtoC(int level, int streamIndex) +void UpdateGrid27::prepareExchangeMultiGPUAfterFtoC(int level, CudaStreamIndex streamIndex) { prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level, streamIndex); prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level, streamIndex); prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level, streamIndex); } -void UpdateGrid27::exchangeMultiGPU(int level, int streamIndex) +void UpdateGrid27::exchangeMultiGPU(int level, CudaStreamIndex streamIndex) { ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition @@ -168,30 +168,30 @@ void UpdateGrid27::exchangeMultiGPU_noStreams_withPrepare(int level, bool useRed // 3D domain decomposition if (useReducedComm) { // X - prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level, -1); - exchangeCollDataXGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferXGPU27AfterFtoC(para.get(), level, -1); + prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy); + exchangeCollDataXGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy); + scatterNodesFromRecvBufferXGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy); // Y - prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level, -1); - exchangeCollDataYGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferYGPU27AfterFtoC(para.get(), level, -1); + prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy); + exchangeCollDataYGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy); + scatterNodesFromRecvBufferYGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy); // Z - prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level, -1); - exchangeCollDataZGPU27AfterFtoC(para.get(), 
comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferZGPU27AfterFtoC(para.get(), level, -1); + prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy); + exchangeCollDataZGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy); + scatterNodesFromRecvBufferZGPU27AfterFtoC(para.get(), level, CudaStreamIndex::Legacy); } else { // X - prepareExchangeCollDataXGPU27AllNodes(para.get(), level, -1); - exchangeCollDataXGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferXGPU27AllNodes(para.get(), level, -1); + prepareExchangeCollDataXGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy); + exchangeCollDataXGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy); + scatterNodesFromRecvBufferXGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy); // Y - prepareExchangeCollDataYGPU27AllNodes(para.get(), level, -1); - exchangeCollDataYGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferYGPU27AllNodes(para.get(), level, -1); + prepareExchangeCollDataYGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy); + exchangeCollDataYGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy); + scatterNodesFromRecvBufferYGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy); // Z - prepareExchangeCollDataZGPU27AllNodes(para.get(), level, -1); - exchangeCollDataZGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferZGPU27AllNodes(para.get(), level, -1); + prepareExchangeCollDataZGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy); + exchangeCollDataZGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, CudaStreamIndex::Legacy); + scatterNodesFromRecvBufferZGPU27AllNodes(para.get(), level, CudaStreamIndex::Legacy); } ////////////////////////////////////////////////////////////////////////// @@ -204,7 +204,7 @@ void UpdateGrid27::exchangeMultiGPU_noStreams_withPrepare(int level, bool useRed exchangePostCollDataADZGPU27(para.get(), comm, cudaMemoryManager.get(), level); } } -void UpdateGrid27::exchangeMultiGPUAfterFtoC(int level, int streamIndex) +void UpdateGrid27::exchangeMultiGPUAfterFtoC(int level, CudaStreamIndex streamIndex) { ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition @@ -317,13 +317,12 @@ void UpdateGrid27::preCollisionBC(int level, unsigned int t) ////////////////////////////////////////////////////////////////////////////////// } -void UpdateGrid27::fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &offFC, - int streamIndex) +void UpdateGrid27::fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &offFC, CudaStreamIndex streamIndex) { gridScalingKernelManager->runFineToCoarseKernelLB(level, icellFC, offFC, streamIndex); if (para->getDiffOn()) { - if (streamIndex != -1) { + if (para->getStreamManager()->streamIsRegistered(streamIndex)) { printf("fineToCoarse Advection Diffusion not implemented"); // TODO return; } @@ -331,14 +330,13 @@ void UpdateGrid27::fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC & } } -void UpdateGrid27::coarseToFine(int level, InterpolationCellCF* icellCF, OffCF &offCF, - int streamIndex) +void UpdateGrid27::coarseToFine(int level, InterpolationCellCF* icellCF, OffCF &offCF, CudaStreamIndex streamIndex) { this->gridScalingKernelManager->runCoarseToFineKernelLB(level, icellCF, offCF, streamIndex); 
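// The -1 sentinel checks are replaced by streamIsRegistered(). A minimal sketch of the assumed
// CudaStreamManager behavior (the real class is in Parameter/CudaStreamManager.h; the enumerator names
// Legacy, Border and Bulk appear in this diff, everything else below is illustrative): getStream() falls
// back to the CUDA legacy default stream when the requested stream was never registered, so the
// no-streams code path keeps working unchanged.
#include <map>
#include <cuda.h>
#include <cuda_runtime.h>

enum class CudaStreamIndex { Legacy, Border, Bulk };

class CudaStreamManagerSketch
{
public:
    bool streamIsRegistered(CudaStreamIndex streamIndex) const
    {
        return streams.find(streamIndex) != streams.end();
    }
    cudaStream_t getStream(CudaStreamIndex streamIndex) const
    {
        auto it = streams.find(streamIndex);
        return (it != streams.end()) ? it->second : CU_STREAM_LEGACY; // legacy default stream
    }

private:
    std::map<CudaStreamIndex, cudaStream_t> streams; // filled when streams are created, e.g. for Border and Bulk
};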
if (para->getDiffOn()) { - if (streamIndex != -1){ + if (para->getStreamManager()->streamIsRegistered(streamIndex)) { printf("CoarseToFineWithStream Advection Diffusion not implemented"); // TODO return; } diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h index 8110923bf066412e2bb09ffa1f10efe3ddc983c7..7bdc25a008ae6c9a0bc34c62d62dae7e87cc23f5 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h +++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h @@ -4,6 +4,7 @@ #include "LBM/LB.h" #include "GPU/GPU_Interface.h" #include "Parameter/Parameter.h" +#include "Parameter/CudaStreamManager.h" #include "GPU/CudaMemoryManager.h" #include "Communication/Communicator.h" #include "Calculation/PorousMedia.h" @@ -15,7 +16,6 @@ class Kernel; class BoundaryConditionFactory; class GridScalingFactory; class TurbulenceModelFactory; - class UpdateGrid27; using CollisionStrategy = std::function<void (UpdateGrid27* updateGrid, Parameter* para, int level, unsigned int t)>; using RefinementStrategy = std::function<void (UpdateGrid27* updateGrid, Parameter* para, int level)>; @@ -31,21 +31,21 @@ public: private: void collisionAllNodes(int level, unsigned int t); - void collisionUsingIndices(int level, unsigned int t, uint *fluidNodeIndices = nullptr, uint numberOfFluidNodes = 0, int stream = -1); + void collisionUsingIndices(int level, unsigned int t, uint *fluidNodeIndices = nullptr, uint numberOfFluidNodes = 0, CudaStreamIndex streamIndex = CudaStreamIndex::Legacy); void collisionAdvectionDiffusion(int level); void postCollisionBC(int level); void preCollisionBC(int level, unsigned int t); void collisionPorousMedia(int level); - void fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &offFC, int streamIndex); - void coarseToFine(int level, InterpolationCellCF* icellCF, OffCF &offCF, int streamIndex); + void fineToCoarse(int level, InterpolationCellFC* icellFC, OffFC &offFC, CudaStreamIndex streamIndex); + void coarseToFine(int level, InterpolationCellCF* icellCF, OffCF &offCF, CudaStreamIndex streamIndex); - void prepareExchangeMultiGPU(int level, int streamIndex); - void prepareExchangeMultiGPUAfterFtoC(int level, int streamIndex); + void prepareExchangeMultiGPU(int level, CudaStreamIndex streamIndex); + void prepareExchangeMultiGPUAfterFtoC(int level, CudaStreamIndex streamIndex); - void exchangeMultiGPU(int level, int streamIndex); - void exchangeMultiGPUAfterFtoC(int level, int streamIndex); + void exchangeMultiGPU(int level, CudaStreamIndex streamIndex); + void exchangeMultiGPUAfterFtoC(int level, CudaStreamIndex streamIndex); void exchangeMultiGPU_noStreams_withPrepare(int level, bool useReducedComm); void swapBetweenEvenAndOddTimestep(int level); diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp index 8986837b502dd6a8b1ec6d8be06318eb4c4d6cc8..7b8950a9334450e9238cae6b2fb08c0f8d441c64 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp @@ -11,11 +11,11 @@ using namespace vf::lbm::dir; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // 3D domain decomposition: functions used by all directions //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void 
collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, +void collectNodesInSendBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighbor, unsigned int numberOfSendProcessNeighbors) { - cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(streamIndex); for (unsigned int i = 0; i < numberOfSendProcessNeighbors; i++) { GetSendFsPostDev27(para->getParD(level)->distributions.f[0], @@ -32,11 +32,11 @@ void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, } } -void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex, +void scatterNodesFromRecvBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, unsigned int numberOfRecvProcessNeighbors) { - cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(streamIndex); for (unsigned int i = 0; i < numberOfRecvProcessNeighbors; i++) { SetRecvFsPostDev27(para->getParD(level)->distributions.f[0], (*recvProcessNeighborDev)[i].f[0], @@ -105,22 +105,22 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // X //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex) { collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborX, (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); } -void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex) { collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCX, (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); } void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level, CudaStreamIndex streamIndex) { - exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex, &para->getParD(level)->sendProcessNeighborX, &para->getParD(level)->recvProcessNeighborX, &para->getParH(level)->sendProcessNeighborX, @@ -128,41 +128,41 @@ void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm } void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level, CudaStreamIndex streamIndex) { - exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCX, &para->getParD(level)->recvProcessNeighborsAfterFtoCX, &para->getParH(level)->sendProcessNeighborsAfterFtoCX, 
&para->getParH(level)->recvProcessNeighborsAfterFtoCX); } -void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex) { - scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborX, + scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborX, (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); } -void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex) { - scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCX, + scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCX, (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); } -void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, - int streamIndex, +void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, + int level, CudaStreamIndex streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHost) { - cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(streamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //! \details steps: //! 1. copy data from device to host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - cudaMemoryManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //! 2. start non-blocking receive (MPI) @@ -182,7 +182,7 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //! 7. 
copy received data from host to device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - cudaMemoryManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -190,22 +190,22 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Y //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex) { collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborY, (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); } -void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex) { collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCY, (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); } void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level, CudaStreamIndex streamIndex) { - exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex, &para->getParD(level)->sendProcessNeighborY, &para->getParD(level)->recvProcessNeighborY, &para->getParH(level)->sendProcessNeighborY, @@ -213,38 +213,39 @@ void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm } void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level, CudaStreamIndex streamIndex) { - exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCY, &para->getParD(level)->recvProcessNeighborsAfterFtoCY, &para->getParH(level)->sendProcessNeighborsAfterFtoCY, &para->getParH(level)->recvProcessNeighborsAfterFtoCY); } -void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex) { scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborY, (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); } -void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex) { scatterNodesFromRecvBufferGPU(para, level, streamIndex, 
&para->getParD(level)->recvProcessNeighborsAfterFtoCY, (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); } void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, - int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, + CudaStreamIndex streamIndex, + std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHost) { - cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(streamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // copy Device to Host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - cudaMemoryManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost); @@ -277,7 +278,7 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // copy Host to Device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) { - cudaMemoryManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } @@ -286,61 +287,62 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Z //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex) { collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborZ, (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); } -void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex) { collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ, (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); } void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level, CudaStreamIndex streamIndex) { - exchangeCollDataZGPU27(para, comm, cudaMemoryManager, 
level, streamIndex, + exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex, &para->getParD(level)->sendProcessNeighborZ, &para->getParD(level)->recvProcessNeighborZ, &para->getParH(level)->sendProcessNeighborZ, &para->getParH(level)->recvProcessNeighborZ); } void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level, CudaStreamIndex streamIndex) { - exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ, &para->getParD(level)->recvProcessNeighborsAfterFtoCZ, &para->getParH(level)->sendProcessNeighborsAfterFtoCZ, &para->getParH(level)->recvProcessNeighborsAfterFtoCZ); } -void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex) { scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborZ, (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); } -void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex) { scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCZ, (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, - int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, +void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, + CudaStreamIndex streamIndex, + std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHost) { - cudaStream_t stream = (streamIndex == -1) ? 
CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(streamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // copy Device to Host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - cudaMemoryManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -387,7 +389,7 @@ void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe // copy Host to Device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) { - cudaMemoryManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h index ec930ebbc06554e948204b74e79e0e25b85f57b5..8302ffdc47bfa012c47df00f90c2491039f4eaee 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h @@ -6,6 +6,7 @@ #include "GPU/GPU_Interface.h" #include "LBM/LB.h" #include "Parameter/Parameter.h" +#include "Parameter/CudaStreamManager.h" //! \file ExchangeData27.h //! \ingroup GPU @@ -14,9 +15,9 @@ ////////////////////////////////////////////////////////////////////////// // 1D domain decomposition -void exchangePreCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, +void exchangePreCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level); -void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, +void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level); ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition @@ -24,13 +25,13 @@ void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, Cud // functions used for all directions //! \brief Collect the send nodes in a buffer on the gpu -void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, - std::vector<ProcessNeighbor27> *sendProcessNeighbor, - unsigned int numberOfSendProcessNeighbors); +void collectNodesInSendBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex, + std::vector<ProcessNeighbor27> *sendProcessNeighbor, + unsigned int numberOfSendProcessNeighbors); //! 
\brief Distribute the receive nodes from the buffer on the gpu -void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex, - std::vector<ProcessNeighbor27> *recvProcessNeighborDev, - unsigned int numberOfRecvProcessNeighbors); +void scatterNodesFromRecvBufferGPU(Parameter *para, int level, CudaStreamIndex streamIndex, + std::vector<ProcessNeighbor27> *recvProcessNeighborDev, + unsigned int numberOfRecvProcessNeighbors); //! \brief Copy nodes which are part of the communication in multiple directions //! \details The nodes are copied from the receive buffer in one direction to the send buffer in another direction. The //! copy operation is conducted on the cpu. @@ -49,21 +50,20 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN //! \brief Collect the send nodes for communication in the x direction in a buffer on the gpu //! \details Needed to exchange all nodes, used in the communication after collision step -void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex); +void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex); //! \brief Collect the send nodes for communication in the x direction in a buffer on the gpu //! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in //! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine. //! See [master thesis of Anna Wellmann] -void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex); //! \brief Exchange routine in x direction for simulations on multiple gpus //! \details Send and receive the nodes from the communication buffers on the gpus. //! \param Communicator is needed for the communication between the processes with mpi //! \param CudaMemoryManager is needed for moving the data between host and device -//! \param streamIndex is the index of a CUDA Stream, which is needed for communication hiding //! \param sendProcessNeighborDev, recvProcessNeighborDev, sendProcessNeighborHost, recvProcessNeighborHost are pointers //! to the send and receive arrays, both on the device and the host void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex, + int level, CudaStreamIndex streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, @@ -71,59 +71,59 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe //! \brief Calls exchangeCollDataXGPU27() for exchanging all nodes //! \details Used in the communication after collision step void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex); //! \brief Calls exchangeCollDataXGPU27() for exchanging the nodes, which are part of the communication between the two //! interpolation processes on refined grids //! \details Only exchange nodes which are part of the interpolation process on //! refined grids. This function is used in the exchange which takes place after the interpolation fine to coarse and //! 
before the interpolation coarse to fine. See [master thesis of Anna Wellmann] void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex); //! \brief Distribute the receive nodes (x direction) from the buffer on the gpu //! \details Needed to exchange all nodes, used in the communication after collision step -void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex); +void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex); //! \brief Distribute the receive nodes (x direction) from the buffer on the gpu //! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in //! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine. //! See [master thesis of Anna Wellmann] -void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex); ////////////////////////////////////////////////////////////////////////// // y -void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex); -void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex); +void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex); void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex, + int level, CudaStreamIndex streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHos); void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex); void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); -void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex); -void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex); +void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex); +void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex); // z -void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex); -void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex); +void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex); void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex, + int level, CudaStreamIndex streamIndex, 
std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHost); void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex); void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level, CudaStreamIndex streamIndex); -void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex); -void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, CudaStreamIndex streamIndex); +void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, CudaStreamIndex streamIndex); ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition convection diffusion diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp index 22192216927f91c33fafc23c54c3fae334abdd34..6e7f7e0173873d4f705517238b50fcaed593fceb 100644 --- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp +++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp @@ -524,24 +524,24 @@ void CudaMemoryManager::cudaCopyProcessNeighborXIndex(int lev, unsigned int proc cudaMemcpyHostToDevice)); } void CudaMemoryManager::cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, - const unsigned int &memsizeFsRecv, int streamIndex) + const unsigned int &memsizeFsRecv) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::Border)) checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsRecv, cudaMemcpyHostToDevice)); else checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], - parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], - parameter->getD3Qxx() * memsizeFsRecv, - cudaMemcpyHostToDevice, - parameter->getStreamManager()->getStream(streamIndex))); + parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], + parameter->getD3Qxx() * memsizeFsRecv, + cudaMemcpyHostToDevice, + parameter->getStreamManager()->getStream(CudaStreamIndex::Border))); } void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, - const unsigned int &memsizeFsSend, int streamIndex) + const unsigned int &memsizeFsSend) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::Border)) checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0], parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, @@ -551,7 +551,7 @@ void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int proce parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, cudaMemcpyDeviceToHost, - parameter->getStreamManager()->getStream(streamIndex))); + 
parameter->getStreamManager()->getStream(CudaStreamIndex::Border))); } void CudaMemoryManager::cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor) { @@ -594,35 +594,33 @@ void CudaMemoryManager::cudaCopyProcessNeighborYIndex(int lev, unsigned int proc parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].memsizeIndex, cudaMemcpyHostToDevice)); } -void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv, - int streamIndex) +void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::Border)) checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsRecv, cudaMemcpyHostToDevice)); else - checkCudaErrors(cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], - parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], - parameter->getD3Qxx() * memsizeFsRecv, - cudaMemcpyHostToDevice, - parameter->getStreamManager()->getStream(streamIndex))); + checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], + parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], + parameter->getD3Qxx() * memsizeFsRecv, + cudaMemcpyHostToDevice, + parameter->getStreamManager()->getStream(CudaStreamIndex::Border))); } -void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend, - int streamIndex) +void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::Border)) checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, cudaMemcpyDeviceToHost)); else - checkCudaErrors( - cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], - parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], - parameter->getD3Qxx() * memsizeFsSend, - cudaMemcpyDeviceToHost, parameter->getStreamManager()->getStream(streamIndex))); + checkCudaErrors( cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], + parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], + parameter->getD3Qxx() * memsizeFsSend, + cudaMemcpyDeviceToHost, + parameter->getStreamManager()->getStream(CudaStreamIndex::Border))); } void CudaMemoryManager::cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor) { @@ -666,9 +664,9 @@ void CudaMemoryManager::cudaCopyProcessNeighborZIndex(int lev, unsigned int proc cudaMemcpyHostToDevice)); } void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, - const unsigned int &memsizeFsRecv, int streamIndex) + const unsigned int &memsizeFsRecv) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::Border)) checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0], parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], parameter->getD3Qxx() * 
memsizeFsRecv, @@ -678,12 +676,12 @@ void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int proce parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsRecv, cudaMemcpyHostToDevice, - parameter->getStreamManager()->getStream(streamIndex))); + parameter->getStreamManager()->getStream(CudaStreamIndex::Border))); } void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, - const unsigned int &memsizeFsSend, int streamIndex) + const unsigned int &memsizeFsSend) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamIndex::Border)) checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0], parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, @@ -693,7 +691,7 @@ void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int proce parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, cudaMemcpyDeviceToHost, - parameter->getStreamManager()->getStream(streamIndex))); + parameter->getStreamManager()->getStream(CudaStreamIndex::Border))); } void CudaMemoryManager::cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor) { diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h index 8df1b412c760ef3298f4426c206777d5c1db1b54..040d0cf74794947d1325d61b40a10f4bca53f58a 100644 --- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h +++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h @@ -91,26 +91,20 @@ public: ////////////////////////////////////////////////////////////////////////// //3D domain decomposition void cudaAllocProcessNeighborX(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv, - int streamIndex); - void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend, - int streamIndex); + void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv); + void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend); void cudaCopyProcessNeighborXIndex(int lev, unsigned int processNeighbor); void cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor); // void cudaAllocProcessNeighborY(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv, - int streamIndex); - void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend, - int streamIndex); + void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv); + void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend); void cudaCopyProcessNeighborYIndex(int lev, unsigned int processNeighbor); void cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor); // void cudaAllocProcessNeighborZ(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv, - int streamIndex); - void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend, - int streamIndex); + void 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
index 8df1b412c760ef3298f4426c206777d5c1db1b54..040d0cf74794947d1325d61b40a10f4bca53f58a 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
@@ -91,26 +91,20 @@ public:
    //////////////////////////////////////////////////////////////////////////
    //3D domain decomposition
    void cudaAllocProcessNeighborX(int lev, unsigned int processNeighbor);
-   void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
-                                     int streamIndex);
-   void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
-                                     int streamIndex);
+   void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv);
+   void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend);
    void cudaCopyProcessNeighborXIndex(int lev, unsigned int processNeighbor);
    void cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor);
    //
    void cudaAllocProcessNeighborY(int lev, unsigned int processNeighbor);
-   void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
-                                     int streamIndex);
-   void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
-                                     int streamIndex);
+   void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv);
+   void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend);
    void cudaCopyProcessNeighborYIndex(int lev, unsigned int processNeighbor);
    void cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor);
    //
    void cudaAllocProcessNeighborZ(int lev, unsigned int processNeighbor);
-   void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
-                                     int streamIndex);
-   void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
-                                     int streamIndex);
+   void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv);
+   void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend);
    void cudaCopyProcessNeighborZIndex(int lev, unsigned int processNeighbor);
    void cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor);
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
index 9f9f7539bc5a1e28612d956ca32234c5a3589f8a..e926d3da3e76bfe8324535d584615915f981c826 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
@@ -5,6 +5,7 @@
 
 #include "Kernel/Utilities/KernelGroup.h"
 #include "PreProcessor/PreProcessorType.h"
+#include "Parameter/CudaStreamManager.h"
 
 #include <helper_cuda.h>
 
@@ -13,7 +14,7 @@ class Kernel
 public:
     virtual ~Kernel() = default;
     virtual void run() = 0;
-    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) = 0; //if stream == -1: run on default stream
+    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, CudaStreamIndex streamIndex = CudaStreamIndex::Legacy) = 0; // CudaStreamIndex::Legacy: run on the default stream
 
     virtual bool checkParameter() = 0;
     virtual std::vector<PreProcessorType> getPreProcessorTypes() = 0;
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
index 630aaf7339afc2907ab6bfbf65bd5fc55f75e215..634f6646bf08b546ecc892b11f6cae0bb50aa790 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
+++ b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
@@ -3,7 +3,7 @@
 
 #include "Kernel/Utilities/CheckParameterStrategy/CheckParameterStrategy.h"
 
-void KernelImp::runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream)
+void KernelImp::runOnIndices(const unsigned int *indices, unsigned int size_indices, CudaStreamIndex streamIndex)
 {
     printf("Method not implemented for this Kernel \n");
 }
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
index 0141ddda7e9579cc84148d26727ed81c084ea0c5..38ca0f9a4f7fee78d64bece43eb735f33567b9ae 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
@@ -9,12 +9,12 @@
 class CheckParameterStrategy;
 class Parameter;
-
+class CudaStreamManager;
 
 class KernelImp : public Kernel
 {
 public:
     virtual void run() = 0;
-    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1);
+    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, CudaStreamIndex streamIndex = CudaStreamIndex::Legacy);
 
     bool checkParameter();
     std::vector<PreProcessorType> getPreProcessorTypes();
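With runOnIndices now taking a CudaStreamIndex that defaults to CudaStreamIndex::Legacy, a concrete kernel can resolve its stream in one call instead of branching on a sentinel. A hedged sketch of a subclass adopting the interface (MyKernel and the commented-out device kernel are placeholders; para and cudaGrid are the usual KernelImp members):

    class MyKernel : public KernelImp
    {
    public:
        void run() override {}
        void runOnIndices(const unsigned int *indices, unsigned int size_indices,
                          CudaStreamIndex streamIndex = CudaStreamIndex::Legacy) override
        {
            // getStream() maps Legacy (and any unregistered index) to the CUDA
            // legacy default stream, so no sentinel check is needed here.
            cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
            // myDeviceKernel<<<cudaGrid.grid, cudaGrid.threads, 0, stream>>>(indices, size_indices);
        }
    };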
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu
index 8c06b7117c8b1ef62b932a76bf5de0be2ae99b1c..1ac8b33bec523c607c0e63621a897a56f7965aee 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu
@@ -30,11 +30,10 @@ void CumulantK17CompChimRedesigned::run()
     getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
 }
 
-void CumulantK17CompChimRedesigned::runOnIndices(const unsigned int *indices, unsigned int size_indices, int streamIndex)
+void CumulantK17CompChimRedesigned::runOnIndices(const unsigned int *indices, unsigned int size_indices, CudaStreamIndex streamIndex)
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
-    LB_Kernel_CumulantK17CompChimRedesigned<<< cudaGrid.grid, cudaGrid.threads, 0, stream>>>(
+    LB_Kernel_CumulantK17CompChimRedesigned<<< cudaGrid.grid, cudaGrid.threads, 0, para->getStreamManager()->getStream(streamIndex)>>>(
         para->getParD(level)->omega,
         para->getParD(level)->neighborX,
         para->getParD(level)->neighborY,
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.h
index 4658075de330665fdba88a5ec8149a9b476d5ac7..0872b99be50183e8813287f0e70674b9f0c3c20a 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.h
@@ -8,7 +8,7 @@ class CumulantK17CompChimRedesigned : public KernelImp
 public:
     static std::shared_ptr<CumulantK17CompChimRedesigned> getNewInstance(std::shared_ptr<Parameter> para, int level);
     void run() override;
-    void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) override;
+    void runOnIndices(const unsigned int *indices, unsigned int size_indices, CudaStreamIndex streamIndex = CudaStreamIndex::Legacy) override;
 
 private:
     CumulantK17CompChimRedesigned();
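Because the stream parameter is defaulted in the virtual interface, existing call sites compile unchanged and keep their old behavior; only callers that want a dedicated stream pass one explicitly. For example (call sites are illustrative):

    kernel->runOnIndices(indices, numberOfIndices);                        // default: legacy stream
    kernel->runOnIndices(indices, numberOfIndices, CudaStreamIndex::Bulk); // bulk stream, if registered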
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu
index 958ef59fbd7b6d199f68ccc53703a08141cf3f63..93bbd0d7793172577a3ed993fcef92e8b6e6d814 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu
@@ -30,10 +30,10 @@ void CumulantK17CompChimStream::run()
     getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
 }
 
-void CumulantK17CompChimStream::runOnIndices(const unsigned int *indices, unsigned int size_indices, int streamIndex)
+void CumulantK17CompChimStream::runOnIndices(const unsigned int *indices, unsigned int size_indices, CudaStreamIndex streamIndex)
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
-
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
+
     LB_Kernel_CumulantK17CompChimStream<<< cudaGrid.grid, cudaGrid.threads, 0, stream>>>(
         para->getParD(level)->omega,
         para->getParD(level)->neighborX,
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h
index 325826e04c893b7c56b7f00bb2503a4eb1fda441..60ad10bbada776b423243788987e02dff390b56a 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h
@@ -8,7 +8,7 @@ class CumulantK17CompChimStream : public KernelImp
 public:
     static std::shared_ptr<CumulantK17CompChimStream> getNewInstance(std::shared_ptr<Parameter> para, int level);
     void run() override;
-    void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) override;
+    void runOnIndices(const unsigned int *indices, unsigned int size_indices, CudaStreamIndex streamIndex = CudaStreamIndex::Legacy) override;
 
 private:
     CumulantK17CompChimStream();
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
index c3129e31a9c750a012a26d58961062eaf3f40add..2b6a266c0d4e5f523091fa4982eee5d83b2ec675 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
@@ -59,8 +59,9 @@ GridScalingKernelManager::GridScalingKernelManager(SPtr<Parameter> parameter, Gr
         VF_LOG_TRACE("Function for scalingCoarseToFine is nullptr");
 }
 
-void GridScalingKernelManager::runFineToCoarseKernelLB(const int level, InterpolationCellFC *icellFC, OffFC &offFC, int streamIndex) const{
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+void GridScalingKernelManager::runFineToCoarseKernelLB(const int level, InterpolationCellFC *icellFC, OffFC &offFC, CudaStreamIndex streamIndex) const
+{
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
 
     this->scalingFineToCoarse(para->getParD(level).get(), para->getParD(level+1).get(), icellFC, offFC, stream);
 
@@ -327,9 +328,9 @@ void GridScalingKernelManager::runFineToCoarseKernelAD(const int level) const
     }
 }
 
-void GridScalingKernelManager::runCoarseToFineKernelLB(const int level, InterpolationCellCF* icellCF, OffCF &offCF, int streamIndex) const
+void GridScalingKernelManager::runCoarseToFineKernelLB(const int level, InterpolationCellCF* icellCF, OffCF &offCF, CudaStreamIndex streamIndex) const
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    cudaStream_t stream = para->getStreamManager()->getStream(streamIndex);
 
     this->scalingCoarseToFine(para->getParD(level).get(), para->getParD(level+1).get(), icellCF, offCF, stream);
 
     // ScaleCF_comp_D3Q27F3(
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
index 85cdd88ec2e3a6622108026ce8f53c5c770f8afe..3c78ee7f9db254556e8ec6dbbafaf51cd995f10b 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
@@ -44,6 +44,7 @@ class Parameter;
 class CudaMemoryManager;
 class GridScalingFactory;
+enum class CudaStreamIndex;
 struct LBMSimulationParameter;
 struct CUstream_st;
 
@@ -62,14 +63,14 @@ public:
     //! \throws std::runtime_error when the user forgets to specify a scaling function
     GridScalingKernelManager(SPtr<Parameter> parameter, GridScalingFactory *gridScalingFactory);
 
     //! \brief calls the device function of the fine to coarse grid interpolation kernel
-    void runFineToCoarseKernelLB(const int level, InterpolationCellFC *icellFC, OffFC &offFC, int streamIndex) const;
+    void runFineToCoarseKernelLB(const int level, InterpolationCellFC *icellFC, OffFC &offFC, CudaStreamIndex streamIndex) const;
 
     //! \brief calls the device function of the fine to coarse grid interpolation kernel (advection diffusion)
     void runFineToCoarseKernelAD(const int level) const;
 
     //! \brief calls the device function of the coarse to fine grid interpolation kernel
-    void runCoarseToFineKernelLB(const int level, InterpolationCellCF *icellCF, OffCF &offCF, int streamIndex) const;
+    void runCoarseToFineKernelLB(const int level, InterpolationCellCF *icellCF, OffCF &offCF, CudaStreamIndex streamIndex) const;
 
     //! \brief calls the device function of the coarse to fine grid interpolation kernel (advection diffusion)
     void runCoarseToFineKernelAD(const int level) const;
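GridScalingKernelManager.h can write `enum class CudaStreamIndex;` instead of including Parameter/CudaStreamManager.h because a scoped enum has a fixed underlying type (int by default) and may therefore be forward-declared. A minimal illustration of the idiom (the function name is illustrative):

    enum class CudaStreamIndex;                       // declaration only; definition lives in CudaStreamManager.h

    void launchOnStream(CudaStreamIndex streamIndex); // complete enough for use in a signature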
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index f3638e96768b3ec52e122809facf3242f4a9149a..2881d1e6e4c23efd015254d53694ef8603fc8770 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -116,7 +116,9 @@ void Simulation::init(GridProvider &gridProvider, BoundaryConditionFactory *bcFa
     //////////////////////////////////////////////////////////////////////////
     // CUDA streams
     if (para->getUseStreams()) {
-        para->getStreamManager()->launchStreams(2u);
+        para->getStreamManager()->registerStream(CudaStreamIndex::Border);
+        para->getStreamManager()->registerStream(CudaStreamIndex::Bulk);
+        para->getStreamManager()->launchStreams();
         para->getStreamManager()->createCudaEvents();
     }
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
index 3731836f336d91c1bc4cc5f1a8f5ea0a10bee0a6..3e9ebb888dccbf36db08161094ce4195afd6bd26 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
@@ -31,25 +31,33 @@
 #include <helper_cuda.h>
 #include <iostream>
 
-void CudaStreamManager::launchStreams(uint numberOfStreams)
+void CudaStreamManager::registerStream(CudaStreamIndex streamIndex)
 {
-    cudaStreams.resize(numberOfStreams);
-    for (cudaStream_t &stream : cudaStreams)
-        cudaStreamCreate(&stream);
+    if(streamIndex != CudaStreamIndex::Legacy)
+        cudaStreams.emplace(streamIndex, nullptr);
+}
+void CudaStreamManager::launchStreams()
+{
+    for (auto &stream : cudaStreams)
+        cudaStreamCreate(&stream.second);
 }
 void CudaStreamManager::terminateStreams()
 {
-    for (cudaStream_t &stream : cudaStreams)
-        cudaStreamDestroy(stream);
+    for (auto &stream : cudaStreams)
+        cudaStreamDestroy(stream.second);
 }
 
-cudaStream_t &CudaStreamManager::getStream(uint streamIndex)
-{ return cudaStreams[streamIndex]; }
-
-int CudaStreamManager::getBorderStreamIndex() { return borderStreamIndex; }
+cudaStream_t &CudaStreamManager::getStream(CudaStreamIndex streamIndex)
+{
+    if(streamIndex == CudaStreamIndex::Legacy) return legacyStream;
+    return streamIsRegistered(streamIndex) ? cudaStreams[streamIndex] : legacyStream;
+}
 
-int CudaStreamManager::getBulkStreamIndex() { return bulkStreamIndex; }
+bool CudaStreamManager::streamIsRegistered(CudaStreamIndex streamIndex)
+{
+    return cudaStreams.count(streamIndex) > 0;
+}
 
 void CudaStreamManager::createCudaEvents()
 {
@@ -61,12 +69,12 @@ void CudaStreamManager::destroyCudaEvents()
     checkCudaErrors(cudaEventDestroy(startBulkKernel));
 }
 
-void CudaStreamManager::triggerStartBulkKernel(int streamIndex)
+void CudaStreamManager::triggerStartBulkKernel(CudaStreamIndex streamIndex)
 {
     checkCudaErrors(cudaEventRecord(startBulkKernel, cudaStreams[streamIndex]));
 }
 
-void CudaStreamManager::waitOnStartBulkKernelEvent(int streamIndex)
+void CudaStreamManager::waitOnStartBulkKernelEvent(CudaStreamIndex streamIndex)
 {
     checkCudaErrors(cudaStreamWaitEvent(cudaStreams[streamIndex], startBulkKernel));
 }
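registerStream(), launchStreams() and getStream() together behave like a small registry with a safe fallback: registering only reserves a map slot, launching creates the CUDA streams, and looking up an unregistered index quietly yields the legacy stream rather than failing. A hedged usage sketch (host code only):

    CudaStreamManager streamManager;
    cudaStream_t before = streamManager.getStream(CudaStreamIndex::Bulk); // unregistered -> legacy stream

    streamManager.registerStream(CudaStreamIndex::Bulk); // reserves a slot; no CUDA call yet
    streamManager.launchStreams();                       // cudaStreamCreate() per registered slot
    cudaStream_t after = streamManager.getStream(CudaStreamIndex::Bulk);  // now a dedicated stream

    streamManager.terminateStreams();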
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
index c2d515ab5fe9c24388632a7ca9e1e4c78b7f1467..33d09a6b99602277b84b98bf76f06bab1633a6bf 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
@@ -30,32 +30,37 @@
 #ifndef STREAM_MANAGER_H
 #define STREAM_MANAGER_H
 
-#include <vector>
-#include "Core/DataTypes.h"
-
+#include <map>
+#include <cuda.h>
 #include <cuda_runtime.h>
-
+
+enum class CudaStreamIndex
+{
+    Legacy,
+    Bulk,
+    Border
+};
+
 class CudaStreamManager
 {
 private:
-    std::vector<cudaStream_t> cudaStreams;
+    std::map<CudaStreamIndex, cudaStream_t> cudaStreams;
     cudaEvent_t startBulkKernel = NULL;
-    const int borderStreamIndex = 1;
-    const int bulkStreamIndex = 0;
+    cudaStream_t legacyStream = CU_STREAM_LEGACY;
 
 public:
-    void launchStreams(uint numberOfStreams);
+    void registerStream(CudaStreamIndex streamIndex);
+    void launchStreams();
     void terminateStreams();
-    cudaStream_t &getStream(uint streamIndex);
-
-    int getBorderStreamIndex();
-    int getBulkStreamIndex();
+    cudaStream_t &getStream(CudaStreamIndex streamIndex);
+    bool streamIsRegistered(CudaStreamIndex streamIndex);
 
     // Events
     void createCudaEvents();
     void destroyCudaEvents();
-    void triggerStartBulkKernel(int streamIndex);
-    void waitOnStartBulkKernelEvent(int strteamIndex);
+    void triggerStartBulkKernel(CudaStreamIndex streamIndex);
+    void waitOnStartBulkKernelEvent(CudaStreamIndex streamIndex);
 };
 
 #endif
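A hedged end-to-end sketch of the lifecycle the refactored manager supports, mirroring the registration in Simulation::init() above. Note that triggerStartBulkKernel() and waitOnStartBulkKernelEvent() index the map directly, so they should only be given registered indices:

    CudaStreamManager streamManager;
    streamManager.registerStream(CudaStreamIndex::Border);
    streamManager.registerStream(CudaStreamIndex::Bulk);
    streamManager.launchStreams();
    streamManager.createCudaEvents();

    // ... enqueue border work on getStream(CudaStreamIndex::Border) ...
    streamManager.triggerStartBulkKernel(CudaStreamIndex::Border);   // record event on the border stream
    streamManager.waitOnStartBulkKernelEvent(CudaStreamIndex::Bulk); // make the bulk stream wait on it
    // ... enqueue bulk work on getStream(CudaStreamIndex::Bulk) ...

    streamManager.destroyCudaEvents();
    streamManager.terminateStreams();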