diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
index 2f2e57bf4f97600012d1dad835956e89c5722b91..167a0ed3fa3bc47a176528026214ca2224a48842 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
@@ -1,16 +1,16 @@
 #include "Communication/ExchangeData27.h"
+#include "Parameter/CudaStreamManager.h"
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
-#include "Parameter/CudaStreamManager.h"
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//3D domain decomposition
+// 3D domain decomposition
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // 3D domain decomposition: functions used by all directions
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
-                                 std::vector<ProcessNeighbor27> *sendProcessNeighbor,
-                                 unsigned int numberOfSendProcessNeighbors)
+                                 std::vector<ProcessNeighbor27> *sendProcessNeighbor,
+                                 unsigned int numberOfSendProcessNeighbors)
 {
     cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
@@ -30,8 +30,8 @@ void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
 }
 
 void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex,
-                                   std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
-                                   unsigned int numberOfRecvProcessNeighbors)
+                                   std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                                   unsigned int numberOfRecvProcessNeighbors)
 {
     cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
     for (unsigned int i = 0; i < numberOfRecvProcessNeighbors; i++) {
@@ -69,16 +69,15 @@ void startNonBlockingMpiReceive(unsigned int numberOfSendProcessNeighbors, vf::g
     }
 }
 
-
 void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeNodes,
-                   std::vector<ProcessNeighbor27> &recvProcessNeighborHost, std::vector<ProcessNeighbor27> &sendProcessNeighborHost)
+                   std::vector<ProcessNeighbor27> &recvProcessNeighborHost,
+                   std::vector<ProcessNeighbor27> &sendProcessNeighborHost)
 {
-
     int indexInSubdomainRecv = 0;
     int indexInSubdomainSend = 0;
     int numNodesInBufferRecv = 0;
     int numNodesInBufferSend = 0;
-    
+
 #pragma omp parallel for
     for (uint i = 0; i < edgeNodes.size(); i++) {
         indexInSubdomainRecv = edgeNodes[i].indexOfProcessNeighborRecv;
@@ -90,27 +89,28 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN
             continue;
         }
 
-        for (int direction = 0; direction <= (int) dirEND; direction++) {
-            (sendProcessNeighborHost[indexInSubdomainSend].f[0] + (direction * numNodesInBufferSend))[edgeNodes[i].indexInSendBuffer] =
-            (recvProcessNeighborHost[indexInSubdomainRecv].f[0] + (direction * numNodesInBufferRecv))[edgeNodes[i].indexInRecvBuffer];
+        for (int direction = 0; direction <= (int)dirEND; direction++) {
+            (sendProcessNeighborHost[indexInSubdomainSend].f[0] +
+             (direction * numNodesInBufferSend))[edgeNodes[i].indexInSendBuffer] =
+                (recvProcessNeighborHost[indexInSubdomainRecv].f[0] +
+                 (direction * numNodesInBufferRecv))[edgeNodes[i].indexInRecvBuffer];
         }
     }
 }
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // X
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborX,
-                                (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
+                                (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
 void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCX,
-                                (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
+                                (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
 void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
@@ -133,13 +133,13 @@ void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *com
                            &para->getParH(level)->recvProcessNeighborsAfterFtoCX);
 }
 
-void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex) 
+void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex)
 {
     scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborX,
                                   (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
 }
 
-void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex) 
+void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
 {
     scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCX,
                                   (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
@@ -154,7 +154,7 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
 {
     cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //copy Device to Host
+    // copy Device to Host
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
         cudaManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
 
@@ -166,35 +166,32 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //Wait
+    // wait
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
         comm->waitGPU(i);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //reset the request array
+    // reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
         comm->resetRequest();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //copy Host to Device
+    // copy Host to Device
    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
         cudaManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Y
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborY,
-                                (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
+                                (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
 void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCY,
-                                (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
+                                (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
 void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
@@ -206,7 +203,7 @@ void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm
                            &para->getParH(level)->sendProcessNeighborY,
                            &para->getParH(level)->recvProcessNeighborY);
 }
- 
+
 void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
                                      int level, int streamIndex)
 {
@@ -217,28 +214,27 @@ void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *com
                            &para->getParH(level)->recvProcessNeighborsAfterFtoCY);
 }
 
-void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex) 
+void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex)
 {
     scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborY,
                                   (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
-void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex) 
+void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
 {
     scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCY,
                                   (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
 
 void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level,
-                            int streamIndex,
-                            std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                            int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
 {
     cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //copy Device to Host
+    // copy Device to Host
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
         cudaManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
@@ -246,49 +242,52 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
     startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // wait for memcopy device to host to finish before sending data
-    if (para->getUseStreams()) cudaStreamSynchronize(stream);
+    if (para->getUseStreams())
+        cudaStreamSynchronize(stream);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // edge nodes: copy received node values from x
-    if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 && para->getParH(level)->sendProcessNeighborY.size() != 0) {
-        if( para->getParH(level)->sendProcessNeighborY[0].numberOfNodes == (*sendProcessNeighborHost)[0].numberOfNodes){
+    if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 &&
+        para->getParH(level)->sendProcessNeighborY.size() != 0) {
+        if (para->getParH(level)->sendProcessNeighborY[0].numberOfNodes ==
+            (*sendProcessNeighborHost)[0].numberOfNodes) {
             // check if in communication of all nodes (as opposed to reduced communication after fine to coarse)
-            copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborX, *sendProcessNeighborHost);
-        } else{
-            copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborsAfterFtoCX, *sendProcessNeighborHost);
+            copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborX,
+                          *sendProcessNeighborHost);
+        } else {
+            copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborsAfterFtoCX,
+                          *sendProcessNeighborHost);
         }
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //Wait
+    // wait
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
         comm->waitGPU(i);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //reset the request array
+    // reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
         comm->resetRequest();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //copy Host to Device
-    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-    {
+    // copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) {
         cudaManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
     }
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Z
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborZ,
-                                (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
+                                (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
 }
 
 void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
 {
     collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ,
-                                (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
+                                (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
 }
 
 void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
@@ -324,52 +323,60 @@ void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int s
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level,
-                            int streamIndex,
-                            std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                            int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
                             std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
                             std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
 {
     cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //copy Device to Host
+    // copy Device to Host
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
         cudaManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // wait for memcopy device to host to finish before sending data
-    if (para->getUseStreams()) cudaStreamSynchronize(stream);
+    if (para->getUseStreams())
+        cudaStreamSynchronize(stream);
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // edge nodes: copy received node values from x
-    if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 && para->getParH(level)->sendProcessNeighborZ.size() != 0) {
-        if( para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes == (*sendProcessNeighborHost)[0].numberOfNodes){
-            // check if in communication of all nodes (as opposed to reduced communication after fine to coarse)
-            copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborX, *sendProcessNeighborHost);
-        } else{
-            copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCX, *sendProcessNeighborHost);
+    if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 &&
+        para->getParH(level)->sendProcessNeighborZ.size() != 0) {
+        if (para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes ==
+            (*sendProcessNeighborHost)[0].numberOfNodes) {
+            // check if in communication of all nodes (as opposed to reduced communication after fine to coarse)
+            copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborX,
+                          *sendProcessNeighborHost);
+        } else {
+            copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCX,
+                          *sendProcessNeighborHost);
         }
     }
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // edge nodes: copy received node values from y
-    if (para->getUseStreams() && para->getNumberOfProcessNeighborsY(level, "recv") > 0 && para->getParH(level)->sendProcessNeighborZ.size() != 0) {
-        if( para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes == (*sendProcessNeighborHost)[0].numberOfNodes){
+    if (para->getUseStreams() && para->getNumberOfProcessNeighborsY(level, "recv") > 0 &&
+        para->getParH(level)->sendProcessNeighborZ.size() != 0) {
+        if (para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes ==
+            (*sendProcessNeighborHost)[0].numberOfNodes) {
             // check if in communication of all nodes (as opposed to reduced communication after fine to coarse)
-            copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborY, *sendProcessNeighborHost);
-        } else{
-            copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCY, *sendProcessNeighborHost);
+            copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborY,
+                          *sendProcessNeighborHost);
+        } else {
+            copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCY,
+                          *sendProcessNeighborHost);
         }
     }
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //Wait
+    // wait
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
         comm->waitGPU(i);
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //reset the request array
+    // reset the request array
     if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
         comm->resetRequest();
     ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    //copy Host to Device
+    // copy Host to Device
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) {
         cudaManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cpp
index ed4470e79e913502bf219e649b4cc08ad6734113..3afedfb061211a15b74573d4e6043e8c3e59671b 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cpp
@@ -31,7 +31,7 @@ protected:
     int level = 0;
     int numNodes = 10;
     std::vector<real> recvFs;
-    std::vector<real> sendFs; 
+    std::vector<real> sendFs;
     std::vector<ProcessNeighbor27> sendProcessNeighborHost;
     std::vector<ProcessNeighbor27> recvProcessNeighborHost;
@@ -47,15 +47,14 @@ protected:
         para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
         para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
         para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
-
     }
 
-    void setUpRecvProcessNeighbors(int numberOfNodesInRecv) 
+    void setUpRecvProcessNeighbors(int numberOfNodesInRecv)
     {
         recvFs.resize(numberOfNodesInRecv);
         std::fill(recvFs.begin(), recvFs.end(), 0.5); // 0.5s should not be copied
         for (LBMSimulationParameter::EdgeNodePositions edgeNode : para->getParH(level)->edgeNodesXtoZ) {
-            if(edgeNode.indexInRecvBuffer>numberOfNodesInRecv){
+            if (edgeNode.indexInRecvBuffer > numberOfNodesInRecv) {
                 continue;
             }
             recvFs[edgeNode.indexInRecvBuffer] = 0.1; // 0.1s should be copied
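
Note for reviewers (not part of the patch): the pointer arithmetic in copyEdgeNodes relies on the direction-major layout of the exchange buffers, where each ProcessNeighbor27 keeps all 27 distribution directions back to back in one flat array, so the value of direction d at node n sits at f[0] + d * numberOfNodes + n. The sketch below only illustrates that indexing; Buffer27, EdgeNode, and copyEdgeNodesSketch are simplified stand-ins invented for the illustration, not types or functions from VirtualFluids.

#include <cstddef>
#include <vector>

// Simplified stand-in for ProcessNeighbor27: one flat, direction-major buffer.
struct Buffer27 {
    int numberOfNodes = 0;
    std::vector<double> f; // size = 27 * numberOfNodes; direction d of node n is f[d * numberOfNodes + n]
};

// Simplified stand-in for LBMSimulationParameter::EdgeNodePositions.
struct EdgeNode {
    int indexInRecvBuffer = 0;
    int indexInSendBuffer = 0;
};

// Same copy pattern as copyEdgeNodes: for every direction, forward the value of an edge node
// from the buffer received in one communication direction into the buffer about to be sent next.
void copyEdgeNodesSketch(const std::vector<EdgeNode> &edgeNodes, const Buffer27 &recv, Buffer27 &send)
{
    const int numDirections = 27; // dirEND + 1 in the real code
    for (const EdgeNode &node : edgeNodes) {
        if (node.indexInRecvBuffer >= recv.numberOfNodes)
            continue; // skip nodes outside a reduced buffer (the real function has a similar guard, see the continue; above)
        for (int d = 0; d < numDirections; ++d) {
            send.f[static_cast<std::size_t>(d) * send.numberOfNodes + node.indexInSendBuffer] =
                recv.f[static_cast<std::size_t>(d) * recv.numberOfNodes + node.indexInRecvBuffer];
        }
    }
}

The numberOfNodes == (*sendProcessNeighborHost)[0].numberOfNodes checks in exchangeCollDataYGPU27 and exchangeCollDataZGPU27 only decide whether the full or the reduced after-fine-to-coarse receive buffer is used as the source of this copy; the indexing itself is the same in both cases.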