diff --git a/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp b/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp index 4a14d19c10936f84379f332ef24f081f0ebb0cb7..222aa361839b71aa50cd3c0b1f3e424317f55120 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp +++ b/src/gpu/VirtualFluids_GPU/Calculation/CollisisionStrategy.cpp @@ -61,28 +61,25 @@ void CollisionAndExchange_noStreams_oldKernel::operator()(UpdateGrid27 *updateGr void CollisionAndExchange_streams::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level, unsigned int t) { - int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex(); - int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex(); - //! \details steps: //! //! 1. run collision for nodes which are at the border of the gpus/processes //! updateGrid->collisionUsingIndices(level, t, para->getParD(level)->fluidNodeIndicesBorder, - para->getParD(level)->numberOfFluidNodesBorder, borderStreamIndex); + para->getParD(level)->numberOfFluidNodesBorder); //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished //! - updateGrid->prepareExchangeMultiGPU(level, borderStreamIndex); + updateGrid->prepareExchangeMultiGPU(level); if (para->getUseStreams()) - para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex); + para->getStreamManager()->triggerStartBulkKernel(CudaStreamManager::StreamIndex::borderStreamIndex); //! 3. launch the collision kernel for bulk nodes //! - para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex); + para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamManager::StreamIndex::bulkStreamIndex); updateGrid->collisionUsingIndices(level, t, para->getParD(level)->fluidNodeIndices, - para->getParD(level)->numberOfFluidNodes, bulkStreamIndex); + para->getParD(level)->numberOfFluidNodes); //! 4. exchange information between GPUs - updateGrid->exchangeMultiGPU(level, borderStreamIndex); + updateGrid->exchangeMultiGPU(level); } diff --git a/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp b/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp index 243c87ae2dfbf8100ad6a5e3a5bc2dd3331d0d32..649e5e1b252adfa2253810710d3aa098f54dbe9f 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp +++ b/src/gpu/VirtualFluids_GPU/Calculation/RefinementStrategy.cpp @@ -38,75 +38,71 @@ void NoRefinement::operator()(UpdateGrid27 *updateGrid, Parameter *para, int lev void RefinementAndExchange_streams_exchangeInterface::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level) { - int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex(); - int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex(); - //! \details steps: //! //! 1. Interpolation fine to coarse for nodes which are at the border of the gpus/processes //! updateGrid->fineToCoarse(level, para->getParD(level)->intFCBorder.ICellFCC, para->getParD(level)->intFCBorder.ICellFCF, - para->getParD(level)->intFCBorder.kFC, borderStreamIndex); + para->getParD(level)->intFCBorder.kFC); //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished //! 
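The collision and refinement strategies in these two files all rely on the same communication-hiding idiom: border work and the subsequent GPU-to-GPU exchange are issued on one CUDA stream, an event marks the point at which the bulk work may start, and the bulk kernel runs concurrently on a second stream. A minimal, self-contained sketch of that idiom only; the kernel, buffer, and variable names are hypothetical and not part of the VirtualFluids API:

#include <cuda_runtime.h>

__global__ void collide(int *nodes, int n) // stand-in for a collision kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) nodes[i] += 1;
}

int main()
{
    cudaStream_t borderStream, bulkStream;
    cudaEvent_t startBulk;
    cudaStreamCreate(&borderStream);
    cudaStreamCreate(&bulkStream);
    cudaEventCreateWithFlags(&startBulk, cudaEventDisableTiming);

    const int nBorder = 1 << 12, nBulk = 1 << 20;
    int *nodes;
    cudaMalloc(&nodes, (nBorder + nBulk) * sizeof(int));

    // 1. work on the border nodes first, on the border stream
    collide<<<(nBorder + 255) / 256, 256, 0, borderStream>>>(nodes, nBorder);
    // 2. buffer packing for the exchange would be issued on borderStream here;
    //    the event then signals that the bulk kernel may start
    cudaEventRecord(startBulk, borderStream);
    // 3. the bulk work waits on the event, then overlaps with the exchange
    cudaStreamWaitEvent(bulkStream, startBulk, 0);
    collide<<<(nBulk + 255) / 256, 256, 0, bulkStream>>>(nodes + nBorder, nBulk);
    // 4. device-to-host copies and MPI calls for the exchange follow on borderStream

    cudaDeviceSynchronize();
    cudaFree(nodes);
    cudaEventDestroy(startBulk);
    cudaStreamDestroy(borderStream);
    cudaStreamDestroy(bulkStream);
    return 0;
}

This is why triggerStartBulkKernel/waitOnStartBulkKernelEvent appear in pairs in the strategies below: the first records the event on the border stream, the second makes the bulk stream wait on it.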
- updateGrid->prepareExchangeMultiGPUAfterFtoC(level, borderStreamIndex); + updateGrid->prepareExchangeMultiGPUAfterFtoC(level); if (para->getUseStreams()) - para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex); + para->getStreamManager()->triggerStartBulkKernel(CudaStreamManager::StreamIndex::borderStreamIndex); //! 3. launch the bulk kernels for both interpolation processes (fine to coarse and coarse to fine) //! - para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex); + para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamManager::StreamIndex::bulkStreamIndex); updateGrid->fineToCoarse(level, para->getParD(level)->intFCBulk.ICellFCC, para->getParD(level)->intFCBulk.ICellFCF, - para->getParD(level)->intFCBulk.kFC, bulkStreamIndex); + para->getParD(level)->intFCBulk.kFC); updateGrid->coarseToFine(level, para->getParD(level)->intCFBulk.ICellCFC, para->getParD(level)->intCFBulk.ICellCFF, - para->getParD(level)->intCFBulk.kCF, para->getParD(level)->offCFBulk, bulkStreamIndex); + para->getParD(level)->intCFBulk.kCF, para->getParD(level)->offCFBulk); //! 4. exchange information between GPUs (only nodes which are part of the interpolation) //! - updateGrid->exchangeMultiGPUAfterFtoC(level, borderStreamIndex); + updateGrid->exchangeMultiGPUAfterFtoC(level); //! 5. interpolation coarse to fine for nodes which are at the border of the gpus/processes //! updateGrid->coarseToFine(level, para->getParD(level)->intCFBorder.ICellCFC, para->getParD(level)->intCFBorder.ICellCFF, - para->getParD(level)->intCFBorder.kCF, para->getParD(level)->offCF, borderStreamIndex); + para->getParD(level)->intCFBorder.kCF, para->getParD(level)->offCF); cudaDeviceSynchronize(); } -void RefinementAndExchange_streams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level){ - int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex(); - int bulkStreamIndex = para->getStreamManager()->getBulkStreamIndex(); +void RefinementAndExchange_streams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level) +{ //! \details steps: //! //! 1. interpolation fine to coarse for nodes which are at the border of the gpus/processes //! updateGrid->fineToCoarse(level, para->getParD(level)->intFCBorder.ICellFCC, para->getParD(level)->intFCBorder.ICellFCF, - para->getParD(level)->intFCBorder.kFC, borderStreamIndex); + para->getParD(level)->intFCBorder.kFC); //! 2. prepare the exchange between gpus (collect the send nodes for communication in a buffer on the gpu) and trigger bulk kernel execution when finished //! 
- para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex); + para->getStreamManager()->waitOnStartBulkKernelEvent(CudaStreamManager::StreamIndex::bulkStreamIndex); updateGrid->fineToCoarse(level, para->getParD(level)->intFCBulk.ICellFCC, para->getParD(level)->intFCBulk.ICellFCF, - para->getParD(level)->intFCBulk.kFC, bulkStreamIndex); + para->getParD(level)->intFCBulk.kFC); updateGrid->coarseToFine(level, para->getParD(level)->intCFBulk.ICellCFC, para->getParD(level)->intCFBulk.ICellCFF, - para->getParD(level)->intCFBulk.kCF, para->getParD(level)->offCFBulk, bulkStreamIndex); + para->getParD(level)->intCFBulk.kCF, para->getParD(level)->offCFBulk); //! 4. exchange information between GPUs (all nodes) //! - updateGrid->exchangeMultiGPU(level, borderStreamIndex); + updateGrid->exchangeMultiGPU(level); //! 5. interpolation coarse to fine for nodes which are at the border of the gpus/processes //! updateGrid->coarseToFine(level, para->getParD(level)->intCFBorder.ICellCFC, para->getParD(level)->intCFBorder.ICellCFF, - para->getParD(level)->intCFBorder.kCF, para->getParD(level)->offCF, borderStreamIndex); + para->getParD(level)->intCFBorder.kCF, para->getParD(level)->offCF); cudaDeviceSynchronize(); } @@ -117,7 +113,7 @@ void RefinementAndExchange_noStreams_exchangeInterface::operator()(UpdateGrid27 //! //! 1. interpolation fine to coarse //! - updateGrid->fineToCoarse(level, para->getParD(level)->intFC.ICellFCC, para->getParD(level)->intFC.ICellFCF, para->getParD(level)->K_FC, -1); + updateGrid->fineToCoarse(level, para->getParD(level)->intFC.ICellFCC, para->getParD(level)->intFC.ICellFCF, para->getParD(level)->K_FC); //! 2. exchange information between GPUs (only nodes which are part of the interpolation) //! @@ -125,7 +121,7 @@ void RefinementAndExchange_noStreams_exchangeInterface::operator()(UpdateGrid27 //! 3. interpolation coarse to fine updateGrid->coarseToFine(level, para->getParD(level)->intCF.ICellCFC, para->getParD(level)->intCF.ICellCFF, para->getParD(level)->K_CF, - para->getParD(level)->offCF, -1); + para->getParD(level)->offCF); } void RefinementAndExchange_noStreams_exchangeAllNodes::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level) @@ -134,7 +130,7 @@ void RefinementAndExchange_noStreams_exchangeAllNodes::operator()(UpdateGrid27 * //! //! 1. interpolation fine to coarse //! - updateGrid->fineToCoarse(level, para->getParD(level)->intFC.ICellFCC, para->getParD(level)->intFC.ICellFCF, para->getParD(level)->K_FC, -1); + updateGrid->fineToCoarse(level, para->getParD(level)->intFC.ICellFCC, para->getParD(level)->intFC.ICellFCF, para->getParD(level)->K_FC); //! 2. exchange information between GPUs (all nodes) //! @@ -142,7 +138,7 @@ void RefinementAndExchange_noStreams_exchangeAllNodes::operator()(UpdateGrid27 * //! 3. interpolation coarse to fine updateGrid->coarseToFine(level, para->getParD(level)->intCF.ICellCFC, para->getParD(level)->intCF.ICellCFF, para->getParD(level)->K_CF, - para->getParD(level)->offCF, -1); + para->getParD(level)->offCF); } void Refinement_noExchange::operator()(UpdateGrid27 *updateGrid, Parameter *para, int level) @@ -151,8 +147,8 @@ void Refinement_noExchange::operator()(UpdateGrid27 *updateGrid, Parameter *para //! //! 1. interpolation fine to coarse //! 
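The prepareExchange*/exchange* calls used by all of these strategies wrap the host-staged pipeline implemented in ExchangeData27.cpp further below: pack and copy the send buffer device-to-host, post the non-blocking MPI receive before sending, send, wait, and copy the received buffer host-to-device. A minimal sketch of that pipeline for a single neighbor, with hypothetical buffer names rather than the ProcessNeighbor27 structures (MPI_Init is assumed to have been called):

#include <cuda_runtime.h>
#include <mpi.h>
#include <vector>

void exchangeWithNeighbor(double *dSend, double *dRecv, int count,
                          int neighborRank, cudaStream_t stream)
{
    std::vector<double> hSend(count), hRecv(count);

    // 1. copy the packed send buffer from device to host
    cudaMemcpyAsync(hSend.data(), dSend, count * sizeof(double),
                    cudaMemcpyDeviceToHost, stream);
    // 2. post the non-blocking receive before sending, to avoid deadlock
    MPI_Request recvRequest;
    MPI_Irecv(hRecv.data(), count, MPI_DOUBLE, neighborRank, 0,
              MPI_COMM_WORLD, &recvRequest);
    // 3. the device-to-host copy must be finished before the buffer is sent
    cudaStreamSynchronize(stream);
    MPI_Send(hSend.data(), count, MPI_DOUBLE, neighborRank, 0, MPI_COMM_WORLD);
    // 4. wait for the neighbor's data, then copy it back to the device
    MPI_Wait(&recvRequest, MPI_STATUS_IGNORE);
    cudaMemcpyAsync(dRecv, hRecv.data(), count * sizeof(double),
                    cudaMemcpyHostToDevice, stream);
}

When this pipeline runs on the border stream, the bulk kernels launched above overlap with the copies and the MPI transfer, which is the whole point of the streams variants.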
- updateGrid->fineToCoarse(level, para->getParD(level)->intFC.ICellFCC, para->getParD(level)->intFC.ICellFCF, para->getParD(level)->K_FC, -1); + updateGrid->fineToCoarse(level, para->getParD(level)->intFC.ICellFCC, para->getParD(level)->intFC.ICellFCF, para->getParD(level)->K_FC); //! 2. interpolation coarse to fine updateGrid->coarseToFine(level, para->getParD(level)->intCF.ICellCFC, para->getParD(level)->intCF.ICellCFF, para->getParD(level)->K_CF, - para->getParD(level)->offCF, -1); + para->getParD(level)->offCF); } diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp index ef5d6c041eee56746818ee219ce3d80cafcf840c..2ca0c4fd50755d82c3250bd12a2aabb71af8efd9 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp +++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp @@ -118,31 +118,31 @@ void UpdateGrid27::collisionAdvectionDiffusion(int level) this->adKernelManager->runADcollisionKernel(level); } -void UpdateGrid27::prepareExchangeMultiGPU(int level, int streamIndex) +void UpdateGrid27::prepareExchangeMultiGPU(int level) { - prepareExchangeCollDataXGPU27AllNodes(para.get(), level, streamIndex); - prepareExchangeCollDataYGPU27AllNodes(para.get(), level, streamIndex); - prepareExchangeCollDataZGPU27AllNodes(para.get(), level, streamIndex); + prepareExchangeCollDataXGPU27AllNodes(para.get(), level); + prepareExchangeCollDataYGPU27AllNodes(para.get(), level); + prepareExchangeCollDataZGPU27AllNodes(para.get(), level); } -void UpdateGrid27::prepareExchangeMultiGPUAfterFtoC(int level, int streamIndex) +void UpdateGrid27::prepareExchangeMultiGPUAfterFtoC(int level) { - prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level, streamIndex); - prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level, streamIndex); - prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level, streamIndex); + prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level); + prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level); + prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level); } -void UpdateGrid27::exchangeMultiGPU(int level, int streamIndex) +void UpdateGrid27::exchangeMultiGPU(int level) { ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition - exchangeCollDataXGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, streamIndex); - exchangeCollDataYGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, streamIndex); - exchangeCollDataZGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, streamIndex); + exchangeCollDataXGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level); + exchangeCollDataYGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level); + exchangeCollDataZGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level); - scatterNodesFromRecvBufferXGPU27AllNodes(para.get(), level, streamIndex); - scatterNodesFromRecvBufferYGPU27AllNodes(para.get(), level, streamIndex); - scatterNodesFromRecvBufferZGPU27AllNodes(para.get(), level, streamIndex); + scatterNodesFromRecvBufferXGPU27AllNodes(para.get(), level); + scatterNodesFromRecvBufferYGPU27AllNodes(para.get(), level); + scatterNodesFromRecvBufferZGPU27AllNodes(para.get(), level); ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition convection diffusion @@ -168,30 +168,30 @@ void UpdateGrid27::exchangeMultiGPU_noStreams_withPrepare(int level, bool useRed // 3D domain decomposition if (useReducedComm) { // X - 
prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level, -1); - exchangeCollDataXGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferXGPU27AfterFtoC(para.get(), level, -1); + prepareExchangeCollDataXGPU27AfterFtoC(para.get(), level); + exchangeCollDataXGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level); + scatterNodesFromRecvBufferXGPU27AfterFtoC(para.get(), level); // Y - prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level, -1); - exchangeCollDataYGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferYGPU27AfterFtoC(para.get(), level, -1); + prepareExchangeCollDataYGPU27AfterFtoC(para.get(), level); + exchangeCollDataYGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level); + scatterNodesFromRecvBufferYGPU27AfterFtoC(para.get(), level); // Z - prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level, -1); - exchangeCollDataZGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferZGPU27AfterFtoC(para.get(), level, -1); + prepareExchangeCollDataZGPU27AfterFtoC(para.get(), level); + exchangeCollDataZGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level); + scatterNodesFromRecvBufferZGPU27AfterFtoC(para.get(), level); } else { // X - prepareExchangeCollDataXGPU27AllNodes(para.get(), level, -1); - exchangeCollDataXGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferXGPU27AllNodes(para.get(), level, -1); + prepareExchangeCollDataXGPU27AllNodes(para.get(), level); + exchangeCollDataXGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level); + scatterNodesFromRecvBufferXGPU27AllNodes(para.get(), level); // Y - prepareExchangeCollDataYGPU27AllNodes(para.get(), level, -1); - exchangeCollDataYGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferYGPU27AllNodes(para.get(), level, -1); + prepareExchangeCollDataYGPU27AllNodes(para.get(), level); + exchangeCollDataYGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level); + scatterNodesFromRecvBufferYGPU27AllNodes(para.get(), level); // Z - prepareExchangeCollDataZGPU27AllNodes(para.get(), level, -1); - exchangeCollDataZGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level, -1); - scatterNodesFromRecvBufferZGPU27AllNodes(para.get(), level, -1); + prepareExchangeCollDataZGPU27AllNodes(para.get(), level); + exchangeCollDataZGPU27AllNodes(para.get(), comm, cudaMemoryManager.get(), level); + scatterNodesFromRecvBufferZGPU27AllNodes(para.get(), level); } ////////////////////////////////////////////////////////////////////////// @@ -204,17 +204,17 @@ void UpdateGrid27::exchangeMultiGPU_noStreams_withPrepare(int level, bool useRed exchangePostCollDataADZGPU27(para.get(), comm, cudaMemoryManager.get(), level); } } -void UpdateGrid27::exchangeMultiGPUAfterFtoC(int level, int streamIndex) +void UpdateGrid27::exchangeMultiGPUAfterFtoC(int level) { ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition - exchangeCollDataXGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, streamIndex); - exchangeCollDataYGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, streamIndex); - exchangeCollDataZGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level, streamIndex); + exchangeCollDataXGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level); + exchangeCollDataYGPU27AfterFtoC(para.get(), comm, 
cudaMemoryManager.get(), level); + exchangeCollDataZGPU27AfterFtoC(para.get(), comm, cudaMemoryManager.get(), level); - scatterNodesFromRecvBufferXGPU27AfterFtoC(para.get(), level, streamIndex); - scatterNodesFromRecvBufferYGPU27AfterFtoC(para.get(), level, streamIndex); - scatterNodesFromRecvBufferZGPU27AfterFtoC(para.get(), level, streamIndex); + scatterNodesFromRecvBufferXGPU27AfterFtoC(para.get(), level); + scatterNodesFromRecvBufferYGPU27AfterFtoC(para.get(), level); + scatterNodesFromRecvBufferZGPU27AfterFtoC(para.get(), level); ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition convection diffusion @@ -322,12 +322,12 @@ void UpdateGrid27::preCollisionBC(int level, unsigned int t) ////////////////////////////////////////////////////////////////////////////////// } -void UpdateGrid27::fineToCoarse(int level, uint *iCellFCC, uint *iCellFCF, uint k_FC, int streamIndex) +void UpdateGrid27::fineToCoarse(int level, uint *iCellFCC, uint *iCellFCF, uint k_FC) { - gridScalingKernelManager->runFineToCoarseKernelLB(level, iCellFCC, iCellFCF, k_FC, streamIndex); + gridScalingKernelManager->runFineToCoarseKernelLB(level, iCellFCC, iCellFCF, k_FC); if (para->getDiffOn()) { - if (streamIndex != -1) { + if (para->getStreamManager()->streamIsRegistered(CudaStreamManager::StreamIndex::borderStreamIndex)) { printf("fineToCoarse Advection Diffusion not implemented"); // TODO return; } @@ -335,14 +335,13 @@ void UpdateGrid27::fineToCoarse(int level, uint *iCellFCC, uint *iCellFCF, uint } } -void UpdateGrid27::coarseToFine(int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF, - int streamIndex) +void UpdateGrid27::coarseToFine(int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF) { - this->gridScalingKernelManager->runCoarseToFineKernelLB(level, iCellCFC, iCellCFF, k_CF, offCF, streamIndex); + this->gridScalingKernelManager->runCoarseToFineKernelLB(level, iCellCFC, iCellCFF, k_CF, offCF); if (para->getDiffOn()) { - if (streamIndex != -1){ + if (para->getStreamManager()->streamIsRegistered(CudaStreamManager::StreamIndex::borderStreamIndex)){ printf("CoarseToFineWithStream Advection Diffusion not implemented"); // TODO return; } diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h index 576b6af60da2f06cab3c352534e8b60f6b71830e..76946bb82a898ae8f779c2b002047b65809a6ddb 100644 --- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h +++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h @@ -38,14 +38,14 @@ private: void preCollisionBC(int level, unsigned int t); void collisionPorousMedia(int level); - void fineToCoarse(int level, uint *iCellFCC, uint *iCellFCF, uint k_FC, int streamIndex); - void coarseToFine(int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF, int streamIndex); + void fineToCoarse(int level, uint *iCellFCC, uint *iCellFCF, uint k_FC); + void coarseToFine(int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF); - void prepareExchangeMultiGPU(int level, int streamIndex); - void prepareExchangeMultiGPUAfterFtoC(int level, int streamIndex); + void prepareExchangeMultiGPU(int level); + void prepareExchangeMultiGPUAfterFtoC(int level); - void exchangeMultiGPU(int level, int streamIndex); - void exchangeMultiGPUAfterFtoC(int level, int streamIndex); + void exchangeMultiGPU(int level); + void exchangeMultiGPUAfterFtoC(int level); void exchangeMultiGPU_noStreams_withPrepare(int level, bool useReducedComm); void 
swapBetweenEvenAndOddTimestep(int level); diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp index 8986837b502dd6a8b1ec6d8be06318eb4c4d6cc8..6ffbf9a09b7b7d50dfdddd698e3b2fb6846e07b8 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp @@ -11,11 +11,11 @@ using namespace vf::lbm::dir; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // 3D domain decomposition: functions used by all directions //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, +void collectNodesInSendBufferGPU(Parameter *para, int level, std::vector<ProcessNeighbor27> *sendProcessNeighbor, unsigned int numberOfSendProcessNeighbors) { - cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex); for (unsigned int i = 0; i < numberOfSendProcessNeighbors; i++) { GetSendFsPostDev27(para->getParD(level)->distributions.f[0], @@ -32,11 +32,11 @@ void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, } } -void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex, +void scatterNodesFromRecvBufferGPU(Parameter *para, int level, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, unsigned int numberOfRecvProcessNeighbors) { - cudaStream_t stream = (streamIndex == -1) ? 
CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex); for (unsigned int i = 0; i < numberOfRecvProcessNeighbors; i++) { SetRecvFsPostDev27(para->getParD(level)->distributions.f[0], (*recvProcessNeighborDev)[i].f[0], @@ -105,22 +105,22 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // X //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level) { - collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborX, + collectNodesInSendBufferGPU(para, level, &para->getParD(level)->sendProcessNeighborX, (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); } -void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level) { - collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCX, + collectNodesInSendBufferGPU(para, level, &para->getParD(level)->sendProcessNeighborsAfterFtoCX, (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); } void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level) { - exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, &para->getParD(level)->sendProcessNeighborX, &para->getParD(level)->recvProcessNeighborX, &para->getParH(level)->sendProcessNeighborX, @@ -128,41 +128,40 @@ void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm } void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level) { - exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataXGPU27(para, comm, cudaMemoryManager, level, &para->getParD(level)->sendProcessNeighborsAfterFtoCX, &para->getParD(level)->recvProcessNeighborsAfterFtoCX, &para->getParH(level)->sendProcessNeighborsAfterFtoCX, &para->getParH(level)->recvProcessNeighborsAfterFtoCX); } -void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level) { - scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborX, + scatterNodesFromRecvBufferGPU(para, level, &para->getParD(level)->recvProcessNeighborX, (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); } -void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level) { - scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCX, + scatterNodesFromRecvBufferGPU(para, level, &para->getParD(level)->recvProcessNeighborsAfterFtoCX, (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); } void exchangeCollDataXGPU27(Parameter 
*para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, - int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHost) { - cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //! \details steps: //! 1. copy data from device to host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - cudaMemoryManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //! 2. start non-blocking receive (MPI) @@ -182,7 +181,7 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //! 7. copy received data from host to device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) - cudaMemoryManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -190,22 +189,22 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Y //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level) { - collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborY, + collectNodesInSendBufferGPU(para, level, &para->getParD(level)->sendProcessNeighborY, (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); } -void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level) { - collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCY, + collectNodesInSendBufferGPU(para, level, &para->getParD(level)->sendProcessNeighborsAfterFtoCY, (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); } void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level) { - exchangeCollDataYGPU27(para, comm, 
cudaMemoryManager, level, streamIndex, + exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, &para->getParD(level)->sendProcessNeighborY, &para->getParD(level)->recvProcessNeighborY, &para->getParH(level)->sendProcessNeighborY, @@ -213,38 +212,38 @@ void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm } void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level) { - exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataYGPU27(para, comm, cudaMemoryManager, level, &para->getParD(level)->sendProcessNeighborsAfterFtoCY, &para->getParD(level)->recvProcessNeighborsAfterFtoCY, &para->getParH(level)->sendProcessNeighborsAfterFtoCY, &para->getParH(level)->recvProcessNeighborsAfterFtoCY); } -void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level) { - scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborY, + scatterNodesFromRecvBufferGPU(para, level, &para->getParD(level)->recvProcessNeighborY, (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); } -void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level) { - scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCY, + scatterNodesFromRecvBufferGPU(para, level, &para->getParD(level)->recvProcessNeighborsAfterFtoCY, (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); } void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, - int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, + std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHost) { - cudaStream_t stream = (streamIndex == -1) ? 
CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // copy Device to Host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) - cudaMemoryManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost); @@ -277,7 +276,7 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // copy Host to Device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) { - cudaMemoryManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } @@ -286,61 +285,61 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Z //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level) { - collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborZ, + collectNodesInSendBufferGPU(para, level, &para->getParD(level)->sendProcessNeighborZ, (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); } -void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level) { - collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ, + collectNodesInSendBufferGPU(para, level, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ, (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); } void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level) { - exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex, + exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, &para->getParD(level)->sendProcessNeighborZ, &para->getParD(level)->recvProcessNeighborZ, &para->getParH(level)->sendProcessNeighborZ, &para->getParH(level)->recvProcessNeighborZ); } void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex) + int level) { - exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, streamIndex, + 
exchangeCollDataZGPU27(para, comm, cudaMemoryManager, level, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ, &para->getParD(level)->recvProcessNeighborsAfterFtoCZ, &para->getParH(level)->sendProcessNeighborsAfterFtoCZ, &para->getParH(level)->recvProcessNeighborsAfterFtoCZ); } -void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level) { - scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborZ, + scatterNodesFromRecvBufferGPU(para, level, &para->getParD(level)->recvProcessNeighborZ, (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); } -void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex) +void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level) { - scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCZ, + scatterNodesFromRecvBufferGPU(para, level, &para->getParD(level)->recvProcessNeighborsAfterFtoCZ, (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, int level, - int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, + std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHost) { - cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // copy Device to Host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) - cudaMemoryManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -387,7 +386,7 @@ void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe // copy Host to Device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) { - cudaMemoryManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); + cudaMemoryManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// } diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h index ec930ebbc06554e948204b74e79e0e25b85f57b5..a9e945e61d0b44b4bc8ca5278ca6408b72c45674 
100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h @@ -24,11 +24,11 @@ void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, Cud // functions used for all directions //! \brief Collect the send nodes in a buffer on the gpu -void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, +void collectNodesInSendBufferGPU(Parameter *para, int level, std::vector<ProcessNeighbor27> *sendProcessNeighbor, unsigned int numberOfSendProcessNeighbors); //! \brief Distribute the receive nodes from the buffer on the gpu -void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex, +void scatterNodesFromRecvBufferGPU(Parameter *para, int level, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, unsigned int numberOfRecvProcessNeighbors); //! \brief Copy nodes which are part of the communication in multiple directions @@ -49,21 +49,20 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN //! \brief Collect the send nodes for communication in the x direction in a buffer on the gpu //! \details Needed to exchange all nodes, used in the communication after collision step -void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex); +void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level); //! \brief Collect the send nodes for communication in the x direction in a buffer on the gpu //! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in //! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine. //! See [master thesis of Anna Wellmann] -void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level); //! \brief Exchange routine in x direction for simulations on multiple gpus //! \details Send and receive the nodes from the communication buffers on the gpus. //! \param Communicator is needed for the communication between the processes with mpi //! \param CudaMemoryManager is needed for moving the data between host and device -//! \param streamIndex is the index of a CUDA Stream, which is needed for communication hiding //! \param sendProcessNeighborDev, recvProcessNeighborDev, sendProcessNeighborHost, recvProcessNeighborHost are pointers //! to the send and receive arrays, both on the device and the host void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex, + int level, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, @@ -71,59 +70,59 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMe //! \brief Calls exchangeCollDataXGPU27() for exchanging all nodes //! \details Used in the communication after collision step void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level); //! \brief Calls exchangeCollDataXGPU27() for exchanging the nodes, which are part of the communication between the two //! interpolation processes on refined grids //! 
\details Only exchange nodes which are part of the interpolation process on //! refined grids. This function is used in the exchange which takes place after the interpolation fine to coarse and //! before the interpolation coarse to fine. See [master thesis of Anna Wellmann] void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level); //! \brief Distribute the receive nodes (x direction) from the buffer on the gpu //! \details Needed to exchange all nodes, used in the communication after collision step -void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex); +void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level); //! \brief Distribute the receive nodes (x direction) from the buffer on the gpu //! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in //! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine. //! See [master thesis of Anna Wellmann] -void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level); ////////////////////////////////////////////////////////////////////////// // y -void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex); -void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level); +void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level); void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex, + int level, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHos); void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level); void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); -void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex); -void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level); +void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level); +void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level); // z -void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex); -void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level); +void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level); void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaMemoryManager, - int level, int streamIndex, + int level, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, 
std::vector<ProcessNeighbor27> *recvProcessNeighborHost); void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level); void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, - CudaMemoryManager *cudaMemoryManager, int level, int streamIndex); + CudaMemoryManager *cudaMemoryManager, int level); -void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex); -void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex); +void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level); +void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level); ////////////////////////////////////////////////////////////////////////// // 3D domain decomposition convection diffusion diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp index b37699962ab92963837bdb191554940a4889329c..129430cf48efb6c419317781c5e40ba76a3bf19e 100644 --- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp +++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp @@ -528,24 +528,24 @@ void CudaMemoryManager::cudaCopyProcessNeighborXIndex(int lev, unsigned int proc cudaMemcpyHostToDevice)); } void CudaMemoryManager::cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, - const unsigned int &memsizeFsRecv, int streamIndex) + const unsigned int &memsizeFsRecv) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamManager::StreamIndex::borderStreamIndex)) checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsRecv, cudaMemcpyHostToDevice)); else checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], - parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], - parameter->getD3Qxx() * memsizeFsRecv, - cudaMemcpyHostToDevice, - parameter->getStreamManager()->getStream(streamIndex))); + parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], + parameter->getD3Qxx() * memsizeFsRecv, + cudaMemcpyHostToDevice, + parameter->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex))); } void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, - const unsigned int &memsizeFsSend, int streamIndex) + const unsigned int &memsizeFsSend) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamManager::StreamIndex::borderStreamIndex)) checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0], parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, @@ -555,7 +555,7 @@ void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int proce parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, cudaMemcpyDeviceToHost, - parameter->getStreamManager()->getStream(streamIndex))); + parameter->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex))); } void CudaMemoryManager::cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor) { @@ -598,35 +598,33 @@ void 
CudaMemoryManager::cudaCopyProcessNeighborYIndex(int lev, unsigned int proc parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].memsizeIndex, cudaMemcpyHostToDevice)); } -void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv, - int streamIndex) +void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamManager::StreamIndex::borderStreamIndex)) checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsRecv, cudaMemcpyHostToDevice)); else - checkCudaErrors(cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], - parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], - parameter->getD3Qxx() * memsizeFsRecv, - cudaMemcpyHostToDevice, - parameter->getStreamManager()->getStream(streamIndex))); + checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], + parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], + parameter->getD3Qxx() * memsizeFsRecv, + cudaMemcpyHostToDevice, + parameter->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex))); } -void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend, - int streamIndex) +void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamManager::StreamIndex::borderStreamIndex)) checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, cudaMemcpyDeviceToHost)); else - checkCudaErrors( - cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], - parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], - parameter->getD3Qxx() * memsizeFsSend, - cudaMemcpyDeviceToHost, parameter->getStreamManager()->getStream(streamIndex))); + checkCudaErrors( cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], + parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], + parameter->getD3Qxx() * memsizeFsSend, + cudaMemcpyDeviceToHost, + parameter->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex))); } void CudaMemoryManager::cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor) { @@ -670,9 +668,9 @@ void CudaMemoryManager::cudaCopyProcessNeighborZIndex(int lev, unsigned int proc cudaMemcpyHostToDevice)); } void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, - const unsigned int &memsizeFsRecv, int streamIndex) + const unsigned int &memsizeFsRecv) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamManager::StreamIndex::borderStreamIndex)) checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0], parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsRecv, @@ -682,12 +680,12 @@ void 
CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int proce parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsRecv, cudaMemcpyHostToDevice, - parameter->getStreamManager()->getStream(streamIndex))); + parameter->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex))); } void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, - const unsigned int &memsizeFsSend, int streamIndex) + const unsigned int &memsizeFsSend) { - if (streamIndex == -1) + if (!parameter->getStreamManager()->streamIsRegistered(CudaStreamManager::StreamIndex::borderStreamIndex)) checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0], parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, @@ -697,7 +695,7 @@ void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int proce parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], parameter->getD3Qxx() * memsizeFsSend, cudaMemcpyDeviceToHost, - parameter->getStreamManager()->getStream(streamIndex))); + parameter->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex))); } void CudaMemoryManager::cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor) { @@ -1738,7 +1736,7 @@ void CudaMemoryManager::cudaCopyPrecursorBC(int lev) void CudaMemoryManager::cudaCopyPrecursorData(int lev) { auto prec = &parameter->getParH(lev)->precursorBC; - auto precStream = parameter->getStreamManager()->getStream(parameter->getStreamManager()->getPrecursorStreamIndex()); + auto precStream = parameter->getStreamManager()->getStream(CudaStreamManager::StreamIndex::precursorStream); size_t memSize = prec->numberOfPrecursorNodes*sizeof(real)*prec->numberOfQuantities; checkCudaErrors( cudaStreamSynchronize(precStream) ); checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->precursorBC.next, prec->next, memSize, cudaMemcpyHostToDevice, precStream) ); diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h index 11aac5c729e66fa8fae708fddbf7a541c33536e4..f8c6d82fbac53ddb7e9901c6be0f6371619f4b74 100644 --- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h +++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h @@ -93,26 +93,20 @@ public: ////////////////////////////////////////////////////////////////////////// //3D domain decomposition void cudaAllocProcessNeighborX(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv, - int streamIndex); - void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend, - int streamIndex); + void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv); + void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend); void cudaCopyProcessNeighborXIndex(int lev, unsigned int processNeighbor); void cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor); // void cudaAllocProcessNeighborY(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv, - int streamIndex); - void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend, - int streamIndex); + void 
cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv); + void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend); void cudaCopyProcessNeighborYIndex(int lev, unsigned int processNeighbor); void cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor); // void cudaAllocProcessNeighborZ(int lev, unsigned int processNeighbor); - void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv, - int streamIndex); - void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend, - int streamIndex); + void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv); + void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend); void cudaCopyProcessNeighborZIndex(int lev, unsigned int processNeighbor); void cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor); diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu index 8c06b7117c8b1ef62b932a76bf5de0be2ae99b1c..019bb01e944c96a08374ac45f8420ac4e7d3efe8 100644 --- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu +++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimRedesigned/CumulantK17CompChimRedesigned.cu @@ -32,7 +32,7 @@ void CumulantK17CompChimRedesigned::run() void CumulantK17CompChimRedesigned::runOnIndices(const unsigned int *indices, unsigned int size_indices, int streamIndex) { - cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); + cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamManager::StreamIndex::bulkStreamIndex); LB_Kernel_CumulantK17CompChimRedesigned<<< cudaGrid.grid, cudaGrid.threads, 0, stream>>>( para->getParD(level)->omega, diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu index 6fae9f6d4845019afd363790eea0ee17c69a060f..b90a06edad1c671a23ffef6fcb2d06c74ad23ed8 100644 --- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu +++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu @@ -32,7 +32,7 @@ void CumulantK17CompChimStream::run() void CumulantK17CompChimStream::runOnIndices(const unsigned int *indices, unsigned int size_indices, int streamIndex) { - cudaStream_t stream = (streamIndex == -1) ? 
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
index 99c200ae9ebf128c1609059a43e5998612e3527e..f2f8129e2cd719b336a9971a48c5c7d9d95371d5 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
@@ -39,9 +39,9 @@ GridScalingKernelManager::GridScalingKernelManager(SPtr<Parameter> parameter):
     para(parameter){}
 
-void GridScalingKernelManager::runFineToCoarseKernelLB(const int level, uint *iCellFCC, uint *iCellFCF, uint k_FC, int streamIndex) const{
+void GridScalingKernelManager::runFineToCoarseKernelLB(const int level, uint *iCellFCC, uint *iCellFCF, uint k_FC) const{
 
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex);
 
     // ScaleFC_comp_D3Q27F3(
     //     para->getParD(level)->distributions.f[0],
@@ -332,9 +332,9 @@ void GridScalingKernelManager::runFineToCoarseKernelAD(const int level) const
     }
 }
 
-void GridScalingKernelManager::runCoarseToFineKernelLB(const int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF, int streamIndex) const
+void GridScalingKernelManager::runCoarseToFineKernelLB(const int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF) const
 {
-    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    cudaStream_t stream = para->getStreamManager()->getStream(CudaStreamManager::StreamIndex::borderStreamIndex);
 
     // ScaleCF_comp_D3Q27F3(
     //     para->getParD(level)->distributions.f[0],
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
index a0f8e15ed3e1c86bda2496cd0a0819a180bba968..d0cd4ab862d1564edefce0a0b15813355d5d8a9e 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.h
@@ -49,14 +49,13 @@ public:
     GridScalingKernelManager(SPtr<Parameter> parameter);
 
     //! \brief calls the device function of the fine to coarse grid interpolation kernel
-    void runFineToCoarseKernelLB(const int level, uint *iCellFCC, uint *iCellFCF, uint k_FC, int streamIndex) const;
+    void runFineToCoarseKernelLB(const int level, uint *iCellFCC, uint *iCellFCF, uint k_FC) const;
     //! \brief calls the device function of the fine to coarse grid interpolation kernel (advection diffusion)
     void runFineToCoarseKernelAD(const int level) const;
     //! \brief calls the device function of the coarse to fine grid interpolation kernel
-    void runCoarseToFineKernelLB(const int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF,
-                                 int streamIndex) const;
+    void runCoarseToFineKernelLB(const int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF) const;
     //! \brief calls the device function of the coarse to fine grid interpolation kernel (advection diffusion)
     void runCoarseToFineKernelAD(const int level) const;
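For the grid scaling runners the change is purely at the signature level: the cell lists and offsets are the same device arrays as before, and the runner now resolves the border stream internally through the stream manager. An illustrative call site of the trimmed API (this wrapper is hypothetical, not taken from the diff):

// Hypothetical wrapper: the trailing int streamIndex argument is dropped;
// everything else at the call site stays the same.
void runBorderCoarseToFine(GridScalingKernelManager &mgr, int level,
                           uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF)
{
    mgr.runCoarseToFineKernelLB(level, iCellCFC, iCellCFF, k_CF, offCF);
}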
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index d31ef0cb2a2a28c20e07207c75c9ef9ef771f853..d2219445b21634037841bc9c5a4a1f67ce372d25 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -115,7 +115,9 @@ void Simulation::init(GridProvider &gridProvider, BoundaryConditionFactory *bcFa
     //////////////////////////////////////////////////////////////////////////
     // CUDA streams
     if (para->getUseStreams()) {
-        para->getStreamManager()->launchStreams(3u);
+        para->getStreamManager()->registerStream(CudaStreamManager::StreamIndex::bulkStreamIndex);
+        para->getStreamManager()->registerStream(CudaStreamManager::StreamIndex::borderStreamIndex);
+        para->getStreamManager()->launchStreams();
         para->getStreamManager()->createCudaEvents();
     }
     //////////////////////////////////////////////////////////////////////////
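The init block above implies an ordering contract: registerStream() only records a key, launchStreams() then creates exactly the registered streams, and event creation comes last. A short lifecycle sketch against the CudaStreamManager API introduced in this diff (setupStreams itself is illustrative):

// Assumes the CudaStreamManager declared later in this diff.
void setupStreams(CudaStreamManager &manager, bool useStreams)
{
    if (useStreams) {
        // Registration only records the key; creation happens in launchStreams().
        manager.registerStream(CudaStreamManager::StreamIndex::bulkStreamIndex);
        manager.registerStream(CudaStreamManager::StreamIndex::borderStreamIndex);
        manager.launchStreams();    // cudaStreamCreate for every registered key
        manager.createCudaEvents();
    }
    // With nothing registered, getStream() hands back the legacy stream,
    // so callers need no special case for the streams-off configuration.
}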
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
index e44c0885b44b673f3c666bed6ee0b20bce436db5..f47f48524dab98a649c736e5945848719d9a2e84 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
@@ -29,29 +29,37 @@
 //=======================================================================================
 #include "CudaStreamManager.h"
 #include <helper_cuda.h>
+#include <cuda_runtime.h>
+#include <cuda.h>
 #include <iostream>
 
-void CudaStreamManager::launchStreams(uint numberOfStreams)
+void CudaStreamManager::registerStream(StreamIndex streamIndex)
 {
-    cudaStreams.resize(numberOfStreams);
-    for (cudaStream_t &stream : cudaStreams)
-        cudaStreamCreate(&stream);
+    cudaStreams.emplace(streamIndex, nullptr);
 }
 
-void CudaStreamManager::terminateStreams()
+void CudaStreamManager::launchStreams()
 {
-    for (cudaStream_t &stream : cudaStreams)
-        cudaStreamDestroy(stream);
+    for (auto &stream : cudaStreams)
+        cudaStreamCreate(&stream.second);
 }
 
-cudaStream_t &CudaStreamManager::getStream(uint streamIndex)
-{ return cudaStreams[streamIndex]; }
+void CudaStreamManager::terminateStreams()
+{
+    for (auto &stream : cudaStreams)
+        cudaStreamDestroy(stream.second);
+}
 
-int CudaStreamManager::getBorderStreamIndex() { return borderStreamIndex; }
+bool CudaStreamManager::streamIsRegistered(StreamIndex streamIndex)
+{
+    return cudaStreams.find(streamIndex) != cudaStreams.end();
+}
 
-int CudaStreamManager::getBulkStreamIndex() { return bulkStreamIndex; }
+cudaStream_t &CudaStreamManager::getStream(StreamIndex streamIndex)
+{
+    return streamIsRegistered(streamIndex) ? cudaStreams[streamIndex] : legacyStream;
+}
 
-int CudaStreamManager::getPrecursorStreamIndex() { return precursorStreamIndex; }
 
 void CudaStreamManager::createCudaEvents()
 {
@@ -63,12 +71,12 @@ void CudaStreamManager::destroyCudaEvents()
     checkCudaErrors(cudaEventDestroy(startBulkKernel));
 }
 
-void CudaStreamManager::triggerStartBulkKernel(int streamIndex)
+void CudaStreamManager::triggerStartBulkKernel(StreamIndex streamIndex)
 {
     checkCudaErrors(cudaEventRecord(startBulkKernel, cudaStreams[streamIndex]));
 }
 
-void CudaStreamManager::waitOnStartBulkKernelEvent(int streamIndex)
+void CudaStreamManager::waitOnStartBulkKernelEvent(StreamIndex streamIndex)
 {
     checkCudaErrors(cudaStreamWaitEvent(cudaStreams[streamIndex], startBulkKernel));
 }
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
index d595fdce4155310b59bf5ae4fcbf008817611c4a..638df9cbafed7f249754b1c30fb93c9d68b9d732 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
@@ -32,31 +32,35 @@
 #include <vector>
 #include "Core/DataTypes.h"
-
+#include <map>
 #include <cuda_runtime.h>
+#include <cuda.h>
 
 class CudaStreamManager
 {
+public:
+    enum StreamIndex {
+        precursorStream,
+        borderStreamIndex,
+        bulkStreamIndex
+    };
+
 private:
-    std::vector<cudaStream_t> cudaStreams;
+    cudaStream_t legacyStream = CU_STREAM_LEGACY;
+    std::map<StreamIndex, cudaStream_t> cudaStreams;
     cudaEvent_t startBulkKernel = NULL;
-    const int precursorStreamIndex = 2;
-    const int borderStreamIndex = 1;
-    const int bulkStreamIndex = 0;
 
 public:
-    void launchStreams(uint numberOfStreams);
+    void registerStream(StreamIndex streamIndex);
+    void launchStreams();
+    bool streamIsRegistered(StreamIndex streamIndex);
     void terminateStreams();
-    cudaStream_t &getStream(uint streamIndex);
-    int getBorderStreamIndex();
-    int getBulkStreamIndex();
-    int getPrecursorStreamIndex();
+    cudaStream_t &getStream(StreamIndex streamIndex);
 
     // Events
     void createCudaEvents();
    void destroyCudaEvents();
-    void triggerStartBulkKernel(int streamIndex);
-    void waitOnStartBulkKernelEvent(int strteamIndex);
+    void triggerStartBulkKernel(StreamIndex streamIndex);
+    void waitOnStartBulkKernelEvent(StreamIndex streamIndex);
 };
 
 #endif
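Taken together, triggerStartBulkKernel and waitOnStartBulkKernelEvent let the border stream gate the bulk stream without blocking the host. A reduced, self-contained sketch of that handshake in raw CUDA (borderWork/bulkWork are placeholder kernels standing in for the border collision and bulk work):

#include <cuda_runtime.h>

__global__ void borderWork() { }
__global__ void bulkWork() { }

void overlapBorderAndBulk()
{
    cudaStream_t border, bulk;
    cudaEvent_t startBulk;
    cudaStreamCreate(&border);
    cudaStreamCreate(&bulk);
    cudaEventCreateWithFlags(&startBulk, cudaEventDisableTiming);

    borderWork<<<1, 1, 0, border>>>();
    cudaEventRecord(startBulk, border);       // trigger: border prologue enqueued
    cudaStreamWaitEvent(bulk, startBulk, 0);  // bulk stream waits, host does not
    bulkWork<<<1, 1, 0, bulk>>>();            // overlaps remaining border work

    cudaStreamSynchronize(border);
    cudaStreamSynchronize(bulk);
    cudaEventDestroy(startBulk);
    cudaStreamDestroy(border);
    cudaStreamDestroy(bulk);
}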