diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp index 167a0ed3fa3bc47a176528026214ca2224a48842..d1bedf5127470ca969b649ca8d73339400ff6aaa 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp @@ -89,6 +89,7 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN continue; } + // copy fs for all directions for (int direction = 0; direction <= (int)dirEND; direction++) { (sendProcessNeighborHost[indexInSubdomainSend].f[0] + (direction * numNodesInBufferSend))[edgeNodes[i].indexInSendBuffer] = @@ -154,25 +155,29 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe { cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // copy Device to Host + //! \details steps: + + //! 1. copy data from device to host for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) cudaManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //! 2. start non-blocking receive (MPI) startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // wait for memcopy device to host to finish before sending data + //! 3. before sending data, wait for memcopy (from device to host) to finish if (para->getUseStreams()) cudaStreamSynchronize(stream); ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + //! 4. send data to neighboring process (MPI) startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // wait + //! 5. wait for until data is received for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) comm->waitGPU(i); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // reset the request array + //! 6. reset the request array, which was used for the mpi communication if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) comm->resetRequest(); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // copy Host to Device + //! 7. copy received data from host to device for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) cudaManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h index d57a9dad95d74a77b8e387811ab9d0262072add8..381938fedf9c0ceb79286df1c91ba7d312e312b9 100644 --- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h +++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h @@ -1,52 +1,96 @@ #ifndef EXCHANGEDATA27_H #define EXCHANGEDATA27_H -#include "LBM/LB.h" -#include "GPU/GPU_Interface.h" -#include "Parameter/Parameter.h" #include "Communication/Communicator.h" #include "GPU/CudaMemoryManager.h" +#include "GPU/GPU_Interface.h" +#include "LBM/LB.h" +#include "Parameter/Parameter.h" + +//! \file ExchangeData27.h +//! \ingroup GPU +//! \author Martin Schoenherr, Anna Wellmann +//! \brief routines for data exchange when running simulations on multiple GPUs ////////////////////////////////////////////////////////////////////////// -//1D domain decomposition -extern "C" void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); -extern "C" void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); +// 1D domain decomposition +extern "C" void exchangePreCollDataGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, + int level); +extern "C" void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, + int level); ////////////////////////////////////////////////////////////////////////// -//3D domain decomposition +// 3D domain decomposition -// functions used for alll direction +// functions used for all directions + +//! \brief collect the send nodes in a buffer on the gpu extern "C" void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, - std::vector<ProcessNeighbor27> *sendProcessNeighbor, - unsigned int numberOfSendProcessNeighbors); + std::vector<ProcessNeighbor27> *sendProcessNeighbor, + unsigned int numberOfSendProcessNeighbors); +//! \brief distribute the receive nodes from the buffer on the gpu extern "C" void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, unsigned int numberOfRecvProcessNeighbors); -extern "C" void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeNodes, +//! \brief copy nodes which are part of the communication in multiple directions +//! \details The nodes are copied from the receive buffer in one direction to the send buffer in another direction. The +//! copy operation is conducted on the cpu. +//! \ref see master thesis of Anna Wellmann (p. 56f: "Communication Hiding bei +//! der Verwendung eines uniformen Simulationsgitters") +//! \param edgeNodes determines from where to where the nodes are +//! copied +//! \param recvProcessNeighborHost is a reference to the receive buffer on the host, nodes are copied from here +//! \param sendProcessNeighborHost is a reference to the send buffer on the host, nodes are copied to here +extern "C" void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeNodes, std::vector<ProcessNeighbor27> &recvProcessNeighborHost, std::vector<ProcessNeighbor27> &sendProcessNeighborHost); +////////////////////////////////////////////////////////////////////////// // x + +//! \brief collect the send nodes for communication in the x direction in a buffer on the gpu +//! \details needed to exchange all nodes, used in the communication after collision step extern "C" void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex); +//! \brief collect the send nodes for communication in the x direction in a buffer on the gpu +//! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in +//! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine. +//! \ref see master thesis of Anna Wellmann extern "C" void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex); - +//! \brief exchange routine in x direction for simulations on multiple gpus +//! \details send and receive the nodes from the communication buffers on the gpus +//! \param Communicator is needed for the communication between the processes with mpi +//! \param CudaMemoryManager is needed for moving the data between host and device +//! \param streamIndex is the index of a CUDA Stream, which is needed for communication hiding +//! \param sendProcessNeighborDev, recvProcessNeighborDev, sendProcessNeighborHost, recvProcessNeighborHost are pointers +//! to the send and receive arrays, both on the device and the host extern "C" void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *recvProcessNeighborHost); +//! \brief calls exchangeCollDataXGPU27() for exchanging all nodes +//! \details used in the communication after collision step extern "C" void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex); +//! \brief calls exchangeCollDataXGPU27() for exchanging the nodes, which are part of the communication between the two +//! interpolation processes on refined grids \details Only exchange nodes which are part of the interpolation process on +//! refined grids. This function is used in the exchange which takes place after the interpolation fine to coarse and +//! before the interpolation coarse to fine. \ref see master thesis of Anna Wellmann extern "C" void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, int streamIndex); - +//! \brief distribute the receive nodes (x direction) from the buffer on the gpu +//! \details needed to exchange all nodes, used in the communication after collision step extern "C" void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex); +//! \brief distribute the receive nodes (x direction) from the buffer on the gpu +//! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in +//! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine. +//! \ref see master thesis of Anna Wellmann extern "C" void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex); - +////////////////////////////////////////////////////////////////////////// // y -extern "C" void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, - int streamIndex); + +extern "C" void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex); extern "C" void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex); extern "C" void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, @@ -62,7 +106,6 @@ extern "C" void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Commun extern "C" void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex); extern "C" void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex); - // z extern "C" void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex); extern "C" void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex); @@ -81,23 +124,30 @@ extern "C" void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Commun extern "C" void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex); extern "C" void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex); - - ////////////////////////////////////////////////////////////////////////// -//3D domain decomposition convection diffusion -extern "C" void exchangePreCollDataADXGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); -extern "C" void exchangePreCollDataADYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); -extern "C" void exchangePreCollDataADZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); -extern "C" void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); -extern "C" void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); -extern "C" void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); +// 3D domain decomposition convection diffusion +extern "C" void exchangePreCollDataADXGPU27(Parameter *para, vf::gpu::Communicator *comm, + CudaMemoryManager *cudaManager, int level); +extern "C" void exchangePreCollDataADYGPU27(Parameter *para, vf::gpu::Communicator *comm, + CudaMemoryManager *cudaManager, int level); +extern "C" void exchangePreCollDataADZGPU27(Parameter *para, vf::gpu::Communicator *comm, + CudaMemoryManager *cudaManager, int level); +extern "C" void exchangePostCollDataADXGPU27(Parameter *para, vf::gpu::Communicator *comm, + CudaMemoryManager *cudaManager, int level); +extern "C" void exchangePostCollDataADYGPU27(Parameter *para, vf::gpu::Communicator *comm, + CudaMemoryManager *cudaManager, int level); +extern "C" void exchangePostCollDataADZGPU27(Parameter *para, vf::gpu::Communicator *comm, + CudaMemoryManager *cudaManager, int level); ////////////////////////////////////////////////////////////////////////// -//3D domain decomposition F3 - K18/K20 -extern "C" void exchangeCollDataF3XGPU( Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); -extern "C" void exchangeCollDataF3YGPU( Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); -extern "C" void exchangeCollDataF3ZGPU( Parameter* para, vf::gpu::Communicator* comm, CudaMemoryManager* cudaManager, int level); +// 3D domain decomposition F3 - K18/K20 +extern "C" void exchangeCollDataF3XGPU(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, + int level); +extern "C" void exchangeCollDataF3YGPU(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, + int level); +extern "C" void exchangeCollDataF3ZGPU(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, + int level); ////////////////////////////////////////////////////////////////////////// -extern "C" void barrierGPU(vf::gpu::Communicator* comm); +extern "C" void barrierGPU(vf::gpu::Communicator *comm); ////////////////////////////////////////////////////////////////////////// #endif