From c5a25580df768d48a6a9a69b17dde2ec549e6a1d Mon Sep 17 00:00:00 2001
From: Anna Wellmann <a.wellmann@tu-braunschweig.de>
Date: Thu, 2 Sep 2021 12:36:39 +0200
Subject: [PATCH] Fix corner node communication for MultiGPU

---
 .../Communication/ExchangeData27.cpp          | 31 ++++++++
 src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp  |  9 ++-
 .../VirtualFluids_GPU/Parameter/Parameter.cpp | 76 ++++++++++++++++++-
 .../VirtualFluids_GPU/Parameter/Parameter.h   | 17 +++++
 4 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
index 4a0cae9b2..bd5b08c43 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
@@ -160,6 +160,16 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
     if (para->getUseStreams())
         cudaStreamSynchronize(stream);
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // corner nodes: forward the values received from x into the y send buffers before the y send starts
+    if (para->getNumberOfProcessNeighborsX(level, "receive") > 0) {
+        for (uint i = 0; i < para->getParH(level)->cornerNodesXtoY.recvPos.size(); i++) {
+            std::pair<int, int> & recvPosX = para->getParH(level)->cornerNodesXtoY.recvPos[i]; // (x receive neighbor, position in its buffer)
+            std::pair<int, int> & sendPosY = para->getParH(level)->cornerNodesXtoY.sendPos[i]; // (y send neighbor, position in its buffer)
+            real &f = para->getParH(level)->recvProcessNeighborX[recvPosX.first].f[0][recvPosX.second];
+            para->getParH(level)->sendProcessNeighborY[sendPosY.first].f[0][sendPosY.second] = f;
+        } // NOTE(review): only f[0][position] is copied per corner node - confirm that no further entries are needed
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // 
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
@@ -258,6 +268,27 @@ void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
     // wait for memcopy device to host to finish before sending data
     if (para->getUseStreams())
         cudaStreamSynchronize(stream);
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // corner nodes: forward the values received from x into the z send buffers before the z send starts
+    if (para->getNumberOfProcessNeighborsX(level, "receive") > 0) {
+        for (uint i = 0; i < para->getParH(level)->cornerNodesXtoZ.recvPos.size(); i++) {
+            std::pair<int, int> &recvPosX = para->getParH(level)->cornerNodesXtoZ.recvPos[i]; // (x receive neighbor, position in its buffer)
+            std::pair<int, int> &sendPosZ = para->getParH(level)->cornerNodesXtoZ.sendPos[i]; // (z send neighbor, position in its buffer)
+            real &f = para->getParH(level)->recvProcessNeighborX[recvPosX.first].f[0][recvPosX.second];
+            para->getParH(level)->sendProcessNeighborZ[sendPosZ.first].f[0][sendPosZ.second] = f;
+        } // NOTE(review): only f[0][position] is copied per corner node - confirm that no further entries are needed
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // corner nodes: forward the values received from y into the z send buffers before the z send starts
+    if (para->getNumberOfProcessNeighborsY(level, "receive") > 0) {
+        for (uint i = 0; i < para->getParH(level)->cornerNodesYtoZ.recvPos.size(); i++) {
+            std::pair<int, int> &recvPosY = para->getParH(level)->cornerNodesYtoZ.recvPos[i]; // (y receive neighbor, position in its buffer)
+            std::pair<int, int> &sendPosZ = para->getParH(level)->cornerNodesYtoZ.sendPos[i]; // (z send neighbor, position in its buffer)
+            real &f = para->getParH(level)->recvProcessNeighborY[recvPosY.first].f[0][recvPosY.second];
+            para->getParH(level)->sendProcessNeighborZ[sendPosZ.first].f[0][sendPosZ.second] = f;
+        } // NOTE(review): only f[0][position] is copied per corner node - confirm that no further entries are needed
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 30555cde1..6a4761963 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -299,7 +299,14 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    //findPressQShip(para);
    //output << "done.\n";
 
-
+   //////////////////////////////////////////////////////////////////////////
+   // find indices of corner nodes for multiGPU communication
+   //////////////////////////////////////////////////////////////////////////
+   if (para->getDevices().size() > 2) { // NOTE(review): presumably corner nodes need a split in more than one direction - confirm the device count is the right criterion
+       output << "Find indices of corner nodes for multiGPU communication ...";
+       para->findCornerNodesCommMultiGPU(); // fills the cornerNodes* position lists of the host parameters
+       output << "done.\n";
+   }
    //////////////////////////////////////////////////////////////////////////
    //Memory alloc for CheckPoint / Restart
    //////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
index 6a75de3b0..2ed642ec5 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
@@ -2492,7 +2492,79 @@ void Parameter::setUseStreams() {
 
 bool Parameter::getUseStreams() { return this->useStreams; }
 
-CudaStreamManager &Parameter::getStreamManager()
-{ return this->cudaStreamManager; }
+CudaStreamManager &Parameter::getStreamManager() { return this->cudaStreamManager; } // returns the member by reference, not a copy
+void Parameter::findCornerNodesCommMultiGPU() { // fills cornerNodesXtoY, cornerNodesXtoZ and cornerNodesYtoZ on every level
+    for (uint level = 0; level < parH.size(); level++) {
+        findCornerNodesXY(level); // called exactly once per level, so no position is recorded twice
+        findCornerNodesXZ(level); // x -> z and y -> z lists are read by exchangeCollDataZGPU27
+        findCornerNodesYZ(level);
+    }
+}
+// Records in cornerNodesXtoY.recvPos every x receive node that findIndexInSendNodesXY locates in a y send buffer.
+void Parameter::findCornerNodesXY(int level)
+{
+    const uint numberOfNeighbors = (unsigned int)(this->getNumberOfProcessNeighborsX(level, "receive"));
+    for (uint neighbor = 0; neighbor < numberOfNeighbors; neighbor++)
+        for (int node = 0; node < parH[level]->recvProcessNeighborX[neighbor].numberOfNodes; node++) {
+            const int nodeIndex = parH[level]->recvProcessNeighborX[neighbor].index[node];
+            if (findIndexInSendNodesXY(level, nodeIndex)) // the helper stores the matching send position itself
+                parH[level]->cornerNodesXtoY.recvPos.push_back(std::pair(neighbor, node));
+        }
+}
+// Returns true if a y send buffer of this level contains node `index`; the first hit is appended to cornerNodesXtoY.sendPos.
+bool Parameter::findIndexInSendNodesXY(int level, int index)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsY(level, "send")); k++)
+        for (int l = 0; l < parH[level]->sendProcessNeighborY[k].numberOfNodes; l++) // bound is the node count of neighbor k
+            if (parH[level]->sendProcessNeighborY[k].index[l] == index) {
+                this->parH[level]->cornerNodesXtoY.sendPos.push_back(std::pair(k, l)); // (send neighbor, position in its buffer)
+                return true;
+            }
+    return false;
+}
+// Records in cornerNodesXtoZ.recvPos every x receive node that findIndexInSendNodesXZ locates in a z send buffer.
+void Parameter::findCornerNodesXZ(int level)
+{
+    const uint numberOfNeighbors = (unsigned int)(this->getNumberOfProcessNeighborsX(level, "receive"));
+    for (uint neighbor = 0; neighbor < numberOfNeighbors; neighbor++)
+        for (int node = 0; node < parH[level]->recvProcessNeighborX[neighbor].numberOfNodes; node++) {
+            const int nodeIndex = parH[level]->recvProcessNeighborX[neighbor].index[node];
+            if (findIndexInSendNodesXZ(level, nodeIndex)) // the helper stores the matching send position itself
+                parH[level]->cornerNodesXtoZ.recvPos.push_back(std::pair(neighbor, node));
+        }
+}
+// Returns true if a z send buffer of this level contains node `index`; the first hit is appended to cornerNodesXtoZ.sendPos.
+bool Parameter::findIndexInSendNodesXZ(int level, int index)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsZ(level, "send")); k++)
+        for (int l = 0; l < parH[level]->sendProcessNeighborZ[k].numberOfNodes; l++) // bound is the node count of neighbor k
+            if (parH[level]->sendProcessNeighborZ[k].index[l] == index) {
+                this->parH[level]->cornerNodesXtoZ.sendPos.push_back(std::pair(k, l)); // (send neighbor, position in its buffer)
+                return true;
+            }
+    return false;
+}
+// Records in cornerNodesYtoZ.recvPos every y receive node that findIndexInSendNodesYZ locates in a z send buffer.
+void Parameter::findCornerNodesYZ(int level)
+{
+    const uint numberOfNeighbors = (unsigned int)(this->getNumberOfProcessNeighborsY(level, "receive"));
+    for (uint neighbor = 0; neighbor < numberOfNeighbors; neighbor++)
+        for (int node = 0; node < parH[level]->recvProcessNeighborY[neighbor].numberOfNodes; node++) {
+            const int nodeIndex = parH[level]->recvProcessNeighborY[neighbor].index[node];
+            if (findIndexInSendNodesYZ(level, nodeIndex)) // the helper stores the matching send position itself
+                parH[level]->cornerNodesYtoZ.recvPos.push_back(std::pair(neighbor, node));
+        }
+}
+// Returns true if a z send buffer of this level contains node `index`; the first hit is appended to cornerNodesYtoZ.sendPos.
+bool Parameter::findIndexInSendNodesYZ(int level, int index)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsZ(level, "send")); k++)
+        for (int l = 0; l < parH[level]->sendProcessNeighborZ[k].numberOfNodes; l++) // bound is the node count of neighbor k
+            if (parH[level]->sendProcessNeighborZ[k].index[l] == index) {
+                this->parH[level]->cornerNodesYtoZ.sendPos.push_back(std::pair(k, l)); // (send neighbor, position in its buffer)
+                return true;
+            }
+    return false;
+}
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
index 782ca95f7..66f36122f 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
@@ -308,7 +308,16 @@ struct LBMSimulationParameter
     std::vector<ProcessNeighborF3> recvProcessNeighborF3Y;
     std::vector<ProcessNeighborF3> recvProcessNeighborF3Z;
     ////////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition: positions (neighbor index, index in that neighbor's array) of corner nodes in ProcessNeighbor27
+    struct cornerNodePostions { // NOTE(review): type name is misspelled ("Postions"); kept as is to leave the interface unchanged
+        std::vector<std::pair<int, int>> recvPos; // position in a receive buffer; entry i belongs to sendPos[i]
+        std::vector<std::pair<int, int>> sendPos; // position in a send buffer that receives the value of recvPos[i]
+    };
+    cornerNodePostions cornerNodesXtoY; // received from x, forwarded into the y send buffers
+    cornerNodePostions cornerNodesXtoZ; // received from x, forwarded into the z send buffers
+    cornerNodePostions cornerNodesYtoZ; // received from y, forwarded into the z send buffers
 
+    ///////////////////////////////////////////////////////
     uint *fluidNodeIndices;
     uint numberOfFluidNodes;
     uint *fluidNodeIndicesBorder;
@@ -854,6 +863,14 @@ public:
     void setUseStreams();
     bool getUseStreams();
     CudaStreamManager &getStreamManager();
+
+    void findCornerNodesCommMultiGPU(); // runs the corner node search on every grid level
+    void findCornerNodesXY(int level); // appends to cornerNodesXtoY.recvPos
+    bool findIndexInSendNodesXY(int level, int index); // on a hit appends to cornerNodesXtoY.sendPos and returns true
+    void findCornerNodesXZ(int level); // appends to cornerNodesXtoZ.recvPos
+    bool findIndexInSendNodesXZ(int level, int index); // on a hit appends to cornerNodesXtoZ.sendPos and returns true
+    void findCornerNodesYZ(int level); // appends to cornerNodesYtoZ.recvPos
+    bool findIndexInSendNodesYZ(int level, int index); // on a hit appends to cornerNodesYtoZ.sendPos and returns true
 };
 
 #endif
-- 
GitLab