Skip to content
Snippets Groups Projects
Commit 9a018cb6 authored by Anna Wellmann's avatar Anna Wellmann
Browse files

Reformat ExchangeData27 and ExchangeData27Test

parent 440a31a2
No related branches found
No related tags found
1 merge request!104Add Communication Hiding to GPU version
#include "Communication/ExchangeData27.h" #include "Communication/ExchangeData27.h"
#include "Parameter/CudaStreamManager.h"
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include "Parameter/CudaStreamManager.h"
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//3D domain decomposition // 3D domain decomposition
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// 3D domain decomposition: functions used by all directions // 3D domain decomposition: functions used by all directions
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
std::vector<ProcessNeighbor27> *sendProcessNeighbor, std::vector<ProcessNeighbor27> *sendProcessNeighbor,
unsigned int numberOfSendProcessNeighbors) unsigned int numberOfSendProcessNeighbors)
{ {
cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
...@@ -30,8 +30,8 @@ void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex, ...@@ -30,8 +30,8 @@ void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
} }
void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex, void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex,
std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
unsigned int numberOfRecvProcessNeighbors) unsigned int numberOfRecvProcessNeighbors)
{ {
cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
for (unsigned int i = 0; i < numberOfRecvProcessNeighbors; i++) { for (unsigned int i = 0; i < numberOfRecvProcessNeighbors; i++) {
...@@ -69,16 +69,15 @@ void startNonBlockingMpiReceive(unsigned int numberOfSendProcessNeighbors, vf::g ...@@ -69,16 +69,15 @@ void startNonBlockingMpiReceive(unsigned int numberOfSendProcessNeighbors, vf::g
} }
} }
void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeNodes, void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeNodes,
std::vector<ProcessNeighbor27> &recvProcessNeighborHost, std::vector<ProcessNeighbor27> &sendProcessNeighborHost) std::vector<ProcessNeighbor27> &recvProcessNeighborHost,
std::vector<ProcessNeighbor27> &sendProcessNeighborHost)
{ {
int indexInSubdomainRecv = 0; int indexInSubdomainRecv = 0;
int indexInSubdomainSend = 0; int indexInSubdomainSend = 0;
int numNodesInBufferRecv = 0; int numNodesInBufferRecv = 0;
int numNodesInBufferSend = 0; int numNodesInBufferSend = 0;
#pragma omp parallel for #pragma omp parallel for
for (uint i = 0; i < edgeNodes.size(); i++) { for (uint i = 0; i < edgeNodes.size(); i++) {
indexInSubdomainRecv = edgeNodes[i].indexOfProcessNeighborRecv; indexInSubdomainRecv = edgeNodes[i].indexOfProcessNeighborRecv;
...@@ -90,27 +89,28 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN ...@@ -90,27 +89,28 @@ void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeN
continue; continue;
} }
for (int direction = 0; direction <= (int) dirEND; direction++) { for (int direction = 0; direction <= (int)dirEND; direction++) {
(sendProcessNeighborHost[indexInSubdomainSend].f[0] + (direction * numNodesInBufferSend))[edgeNodes[i].indexInSendBuffer] = (sendProcessNeighborHost[indexInSubdomainSend].f[0] +
(recvProcessNeighborHost[indexInSubdomainRecv].f[0] + (direction * numNodesInBufferRecv))[edgeNodes[i].indexInRecvBuffer]; (direction * numNodesInBufferSend))[edgeNodes[i].indexInSendBuffer] =
(recvProcessNeighborHost[indexInSubdomainRecv].f[0] +
(direction * numNodesInBufferRecv))[edgeNodes[i].indexInRecvBuffer];
} }
} }
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// X // X
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex) void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex)
{ {
collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborX, collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborX,
(unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
} }
void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex) void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
{ {
collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCX, collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCX,
(unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
} }
void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
...@@ -133,13 +133,13 @@ void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *com ...@@ -133,13 +133,13 @@ void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *com
&para->getParH(level)->recvProcessNeighborsAfterFtoCX); &para->getParH(level)->recvProcessNeighborsAfterFtoCX);
} }
void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex) void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex)
{ {
scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborX, scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborX,
(unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
} }
void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex) void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
{ {
scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCX, scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCX,
(unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
...@@ -154,7 +154,7 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe ...@@ -154,7 +154,7 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
{ {
cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//copy Device to Host // copy Device to Host
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
cudaManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); cudaManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
...@@ -166,35 +166,32 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe ...@@ -166,35 +166,32 @@ void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost); startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//Wait // wait
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) comm->waitGPU(i); for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) comm->waitGPU(i);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//reset the request array // reset the request array
if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) comm->resetRequest(); if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) comm->resetRequest();
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//copy Host to Device // copy Host to Device
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
cudaManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); cudaManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Y // Y
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex) void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex)
{ {
collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborY, collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborY,
(unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
} }
void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex) void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
{ {
collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCY, collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCY,
(unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
} }
void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
...@@ -206,7 +203,7 @@ void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm ...@@ -206,7 +203,7 @@ void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm
&para->getParH(level)->sendProcessNeighborY, &para->getParH(level)->sendProcessNeighborY,
&para->getParH(level)->recvProcessNeighborY); &para->getParH(level)->recvProcessNeighborY);
} }
void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
int level, int streamIndex) int level, int streamIndex)
{ {
...@@ -217,28 +214,27 @@ void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *com ...@@ -217,28 +214,27 @@ void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator *com
&para->getParH(level)->recvProcessNeighborsAfterFtoCY); &para->getParH(level)->recvProcessNeighborsAfterFtoCY);
} }
void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex) void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex)
{ {
scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborY, scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborY,
(unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
} }
void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex) void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
{ {
scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCY, scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCY,
(unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
} }
void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level,
int streamIndex, int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
std::vector<ProcessNeighbor27> *recvProcessNeighborHost) std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
{ {
cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//copy Device to Host // copy Device to Host
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
cudaManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); cudaManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
...@@ -246,49 +242,52 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe ...@@ -246,49 +242,52 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost); startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// wait for memcopy device to host to finish before sending data // wait for memcopy device to host to finish before sending data
if (para->getUseStreams()) cudaStreamSynchronize(stream); if (para->getUseStreams())
cudaStreamSynchronize(stream);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// edge nodes: copy received node values from x // edge nodes: copy received node values from x
if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 && para->getParH(level)->sendProcessNeighborY.size() != 0) { if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 &&
if( para->getParH(level)->sendProcessNeighborY[0].numberOfNodes == (*sendProcessNeighborHost)[0].numberOfNodes){ para->getParH(level)->sendProcessNeighborY.size() != 0) {
if (para->getParH(level)->sendProcessNeighborY[0].numberOfNodes ==
(*sendProcessNeighborHost)[0].numberOfNodes) {
// check if in communication of all nodes (as opposed to reduced communication after fine to coarse) // check if in communication of all nodes (as opposed to reduced communication after fine to coarse)
copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborX, *sendProcessNeighborHost); copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborX,
} else{ *sendProcessNeighborHost);
copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborsAfterFtoCX, *sendProcessNeighborHost); } else {
copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborsAfterFtoCX,
*sendProcessNeighborHost);
} }
} }
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost); startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//Wait // wait
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) comm->waitGPU(i); for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) comm->waitGPU(i);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//reset the request array // reset the request array
if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) comm->resetRequest(); if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) comm->resetRequest();
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//copy Host to Device // copy Host to Device
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) {
{
cudaManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); cudaManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
} }
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Z // Z
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex) void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex)
{ {
collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborZ, collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborZ,
(unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
} }
void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex) void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
{ {
collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ, collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ,
(unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))); (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
} }
void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager,
...@@ -324,52 +323,60 @@ void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int s ...@@ -324,52 +323,60 @@ void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int s
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level, void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMemoryManager *cudaManager, int level,
int streamIndex, int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
std::vector<ProcessNeighbor27> *recvProcessNeighborDev, std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
std::vector<ProcessNeighbor27> *sendProcessNeighborHost, std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
std::vector<ProcessNeighbor27> *recvProcessNeighborHost) std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
{ {
cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex); cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//copy Device to Host // copy Device to Host
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
cudaManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex); cudaManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost); startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// wait for memcopy device to host to finish before sending data // wait for memcopy device to host to finish before sending data
if (para->getUseStreams()) cudaStreamSynchronize(stream); if (para->getUseStreams())
cudaStreamSynchronize(stream);
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// edge nodes: copy received node values from x // edge nodes: copy received node values from x
if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 && para->getParH(level)->sendProcessNeighborZ.size() != 0) { if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 &&
if( para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes == (*sendProcessNeighborHost)[0].numberOfNodes){ para->getParH(level)->sendProcessNeighborZ.size() != 0) {
// check if in communication of all nodes (as opposed to reduced communication after fine to coarse) if (para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes ==
copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborX, *sendProcessNeighborHost); (*sendProcessNeighborHost)[0].numberOfNodes) {
} else{ // check if in communication of all nodes (as opposed to reduced communication after fine to coarse)
copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCX, *sendProcessNeighborHost); copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborX,
*sendProcessNeighborHost);
} else {
copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCX,
*sendProcessNeighborHost);
} }
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// edge nodes: copy received node values from y // edge nodes: copy received node values from y
if (para->getUseStreams() && para->getNumberOfProcessNeighborsY(level, "recv") > 0 && para->getParH(level)->sendProcessNeighborZ.size() != 0) { if (para->getUseStreams() && para->getNumberOfProcessNeighborsY(level, "recv") > 0 &&
if( para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes == (*sendProcessNeighborHost)[0].numberOfNodes){ para->getParH(level)->sendProcessNeighborZ.size() != 0) {
if (para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes ==
(*sendProcessNeighborHost)[0].numberOfNodes) {
// check if in communication of all nodes (as opposed to reduced communication after fine to coarse) // check if in communication of all nodes (as opposed to reduced communication after fine to coarse)
copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborY, *sendProcessNeighborHost); copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborY,
} else{ *sendProcessNeighborHost);
copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCY, *sendProcessNeighborHost); } else {
copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCY,
*sendProcessNeighborHost);
} }
} }
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost); startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//Wait // wait
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) comm->waitGPU(i); for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) comm->waitGPU(i);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//reset the request array // reset the request array
if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) comm->resetRequest(); if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) comm->resetRequest();
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//copy Host to Device // copy Host to Device
for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
{ {
cudaManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex); cudaManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
......
...@@ -31,7 +31,7 @@ protected: ...@@ -31,7 +31,7 @@ protected:
int level = 0; int level = 0;
int numNodes = 10; int numNodes = 10;
std::vector<real> recvFs; std::vector<real> recvFs;
std::vector<real> sendFs; std::vector<real> sendFs;
std::vector<ProcessNeighbor27> sendProcessNeighborHost; std::vector<ProcessNeighbor27> sendProcessNeighborHost;
std::vector<ProcessNeighbor27> recvProcessNeighborHost; std::vector<ProcessNeighbor27> recvProcessNeighborHost;
...@@ -47,15 +47,14 @@ protected: ...@@ -47,15 +47,14 @@ protected:
para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8); para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8); para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8); para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
} }
void setUpRecvProcessNeighbors(int numberOfNodesInRecv) void setUpRecvProcessNeighbors(int numberOfNodesInRecv)
{ {
recvFs.resize(numberOfNodesInRecv); recvFs.resize(numberOfNodesInRecv);
std::fill(recvFs.begin(), recvFs.end(), 0.5); // 0.5s should not be copied std::fill(recvFs.begin(), recvFs.end(), 0.5); // 0.5s should not be copied
for (LBMSimulationParameter::EdgeNodePositions edgeNode : para->getParH(level)->edgeNodesXtoZ) { for (LBMSimulationParameter::EdgeNodePositions edgeNode : para->getParH(level)->edgeNodesXtoZ) {
if(edgeNode.indexInRecvBuffer>numberOfNodesInRecv){ if (edgeNode.indexInRecvBuffer > numberOfNodesInRecv) {
continue; continue;
} }
recvFs[edgeNode.indexInRecvBuffer] = 0.1; // 0.1s should be copied recvFs[edgeNode.indexInRecvBuffer] = 0.1; // 0.1s should be copied
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment