diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
index 4a0cae9b223f0170ae1831406787bd5b777729a5..bd5b08c431bba65e460bbe984860a38774ea6395 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
@@ -160,6 +160,16 @@ void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
     if (para->getUseStreams())
         cudaStreamSynchronize(stream);
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // copy the values (f[0]) of corner nodes received from x into the send buffers for y
+    if (para->getNumberOfProcessNeighborsX(level, "receive") > 0) {
+        for (uint i = 0; i < para->getParH(level)->cornerNodesXtoY.recvPos.size(); i++) {
+            std::pair<int, int> & recvPosX = para->getParH(level)->cornerNodesXtoY.recvPos[i];
+            std::pair<int, int> & sendPosY = para->getParH(level)->cornerNodesXtoY.sendPos[i];
+            real &f = para->getParH(level)->recvProcessNeighborX[recvPosX.first].f[0][recvPosX.second];
+            para->getParH(level)->sendProcessNeighborY[sendPosY.first].f[0][sendPosY.second] = f;
+        }    
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     // 
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
@@ -258,6 +268,27 @@ void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator *comm, CudaMe
     // wait for memcopy device to host to finish before sending data
     if (para->getUseStreams())
         cudaStreamSynchronize(stream);
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // copy the values (f[0]) of corner nodes received from x into the send buffers for z
+    if (para->getNumberOfProcessNeighborsX(level, "receive") > 0) {
+        for (uint i = 0; i < para->getParH(level)->cornerNodesXtoZ.recvPos.size(); i++) {
+            std::pair<int, int> &recvPosX = para->getParH(level)->cornerNodesXtoZ.recvPos[i];
+            std::pair<int, int> &sendPosZ = para->getParH(level)->cornerNodesXtoZ.sendPos[i];
+            real &f = para->getParH(level)->recvProcessNeighborX[recvPosX.first].f[0][recvPosX.second];
+            para->getParH(level)->sendProcessNeighborZ[sendPosZ.first].f[0][sendPosZ.second] = f;
+        }
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // copy the values (f[0]) of corner nodes received from y into the send buffers for z
+    if (para->getNumberOfProcessNeighborsY(level, "receive") > 0) {
+        for (uint i = 0; i < para->getParH(level)->cornerNodesYtoZ.recvPos.size(); i++) {
+            std::pair<int, int> &recvPosY = para->getParH(level)->cornerNodesYtoZ.recvPos[i];
+            std::pair<int, int> &sendPosZ = para->getParH(level)->cornerNodesYtoZ.sendPos[i];
+            real &f = para->getParH(level)->recvProcessNeighborY[recvPosY.first].f[0][recvPosY.second];
+            para->getParH(level)->sendProcessNeighborZ[sendPosZ.first].f[0][sendPosZ.second] = f;
+        }
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     //start blocking MPI send
     for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 30555cde10abaca923a9b8768466111e5b675146..6a4761963c3dd2e8928097d4690a1941c8387d9b 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -299,7 +299,14 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    //findPressQShip(para);
    //output << "done.\n";
 
-
+   //////////////////////////////////////////////////////////////////////////
+   // find indices of corner nodes for multiGPU communication
+   //////////////////////////////////////////////////////////////////////////
+   if (para->getDevices().size() > 2) {
+       output << "Find indices of corner nodes for multiGPU communication ...";
+       para->findCornerNodesCommMultiGPU();
+       output << "done.\n";
+   }
    //////////////////////////////////////////////////////////////////////////
    //Memory alloc for CheckPoint / Restart
    //////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
index 6a75de3b0bfb61869cd8d29009cbe133ba9eaaab..2ed642ec5f7e7a324b1d87aa8d95318657f89623 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
@@ -2492,7 +2492,106 @@ void Parameter::setUseStreams() {
 
 bool Parameter::getUseStreams() { return this->useStreams; }
 
-CudaStreamManager &Parameter::getStreamManager()
-{ return this->cudaStreamManager; }
+CudaStreamManager &Parameter::getStreamManager() { return this->cudaStreamManager; }
+
+// Finds, on every grid level, the corner nodes of the multi-GPU communication: nodes that are
+// received from a process neighbor in one direction and are also sent to a process neighbor in
+// another direction. All three direction pairs are searched (x -> y, x -> z, y -> z); the
+// results are stored in cornerNodesXtoY, cornerNodesXtoZ and cornerNodesYtoZ of parH[level].
+void Parameter::findCornerNodesCommMultiGPU()
+{
+    for (uint level = 0; level < parH.size(); level++) {
+        findCornerNodesXY(level);
+        findCornerNodesXZ(level);
+        findCornerNodesYZ(level);
+    }
+}
+
+// Searches the nodes received in x direction for nodes that are also sent in y direction.
+// For every match the position (process neighbor index, node index) in recvProcessNeighborX
+// is appended to cornerNodesXtoY.recvPos. The matching position in sendProcessNeighborY is
+// appended to cornerNodesXtoY.sendPos by findIndexInSendNodesXY, so both lists stay aligned.
+void Parameter::findCornerNodesXY(int level)
+{
+    for (uint i = 0; i < (unsigned int)(this->getNumberOfProcessNeighborsX(level, "receive")); i++)
+        for (int j = 0; j < parH[level]->recvProcessNeighborX[i].numberOfNodes; j++) {
+            int index = parH[level]->recvProcessNeighborX[i].index[j];
+            if (findIndexInSendNodesXY(level, index))
+                this->parH[level]->cornerNodesXtoY.recvPos.push_back(std::pair(i, j));
+        }
+}
+
+// Looks for the node index `index` in the nodes of all process neighbors sent to in y direction.
+// On the first match the position (process neighbor index, node index) in sendProcessNeighborY
+// is appended to cornerNodesXtoY.sendPos and true is returned; without a match returns false.
+bool Parameter::findIndexInSendNodesXY(int level, int index)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsY(level, "send")); k++)
+        // bound the search by the node count of neighbor k, whose index array is read below
+        for (int l = 0; l < parH[level]->sendProcessNeighborY[k].numberOfNodes; l++)
+            if (parH[level]->sendProcessNeighborY[k].index[l] == index) {
+                this->parH[level]->cornerNodesXtoY.sendPos.push_back(std::pair(k, l));
+                return true;
+            }
+    return false;
+}
+
+// Searches the nodes received in x direction for nodes that are also sent in z direction.
+// For every match the position (process neighbor index, node index) in recvProcessNeighborX
+// is appended to cornerNodesXtoZ.recvPos. The matching position in sendProcessNeighborZ is
+// appended to cornerNodesXtoZ.sendPos by findIndexInSendNodesXZ, so both lists stay aligned.
+void Parameter::findCornerNodesXZ(int level)
+{
+    for (uint i = 0; i < (unsigned int)(this->getNumberOfProcessNeighborsX(level, "receive")); i++)
+        for (int j = 0; j < parH[level]->recvProcessNeighborX[i].numberOfNodes; j++) {
+            int index = parH[level]->recvProcessNeighborX[i].index[j];
+            if (findIndexInSendNodesXZ(level, index))
+                this->parH[level]->cornerNodesXtoZ.recvPos.push_back(std::pair(i, j));
+        }
+}
+
+// Looks for the node index `index` in the nodes of all process neighbors sent to in z direction.
+// On the first match the position (process neighbor index, node index) in sendProcessNeighborZ
+// is appended to cornerNodesXtoZ.sendPos and true is returned; without a match returns false.
+bool Parameter::findIndexInSendNodesXZ(int level, int index)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsZ(level, "send")); k++)
+        // bound the search by the node count of neighbor k, whose index array is read below
+        for (int l = 0; l < parH[level]->sendProcessNeighborZ[k].numberOfNodes; l++)
+            if (parH[level]->sendProcessNeighborZ[k].index[l] == index) {
+                this->parH[level]->cornerNodesXtoZ.sendPos.push_back(std::pair(k, l));
+                return true;
+            }
+    return false;
+}
+
+// Searches the nodes received in y direction for nodes that are also sent in z direction.
+// For every match the position (process neighbor index, node index) in recvProcessNeighborY
+// is appended to cornerNodesYtoZ.recvPos. The matching position in sendProcessNeighborZ is
+// appended to cornerNodesYtoZ.sendPos by findIndexInSendNodesYZ, so both lists stay aligned.
+void Parameter::findCornerNodesYZ(int level)
+{
+    for (uint i = 0; i < (unsigned int)(this->getNumberOfProcessNeighborsY(level, "receive")); i++)
+        for (int j = 0; j < parH[level]->recvProcessNeighborY[i].numberOfNodes; j++) {
+            int index = parH[level]->recvProcessNeighborY[i].index[j];
+            if (findIndexInSendNodesYZ(level, index))
+                this->parH[level]->cornerNodesYtoZ.recvPos.push_back(std::pair(i, j));
+        }
+}
+
+// Looks for the node index `index` in the nodes of all process neighbors sent to in z direction.
+// On the first match the position (process neighbor index, node index) in sendProcessNeighborZ
+// is appended to cornerNodesYtoZ.sendPos and true is returned; without a match returns false.
+bool Parameter::findIndexInSendNodesYZ(int level, int index)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsZ(level, "send")); k++)
+        // bound the search by the node count of neighbor k, whose index array is read below
+        for (int l = 0; l < parH[level]->sendProcessNeighborZ[k].numberOfNodes; l++)
+            if (parH[level]->sendProcessNeighborZ[k].index[l] == index) {
+                this->parH[level]->cornerNodesYtoZ.sendPos.push_back(std::pair(k, l));
+                return true;
+            }
+    return false;
+}
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
index 782ca95f72b1a534d82a7dc434af43609a6c5fa0..66f36122f08da3910b42ade203cb6ef7fdc628e5 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
@@ -308,7 +308,16 @@ struct LBMSimulationParameter
     std::vector<ProcessNeighborF3> recvProcessNeighborF3Y;
     std::vector<ProcessNeighborF3> recvProcessNeighborF3Z;
     ////////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition: position (index in array) of corner nodes in ProcessNeighbor27
+    struct cornerNodePostions { // recvPos[i] and sendPos[i] describe the same corner node
+        std::vector<std::pair<int, int>> recvPos; // (process neighbor index, node index) in the receive arrays
+        std::vector<std::pair<int, int>> sendPos; // (process neighbor index, node index) in the send arrays
+    };
+    cornerNodePostions cornerNodesXtoY;
+    cornerNodePostions cornerNodesXtoZ;
+    cornerNodePostions cornerNodesYtoZ;
 
+    ///////////////////////////////////////////////////////
     uint *fluidNodeIndices;
     uint numberOfFluidNodes;
     uint *fluidNodeIndicesBorder;
@@ -854,6 +863,14 @@ public:
     void setUseStreams();
     bool getUseStreams();
     CudaStreamManager &getStreamManager();
+
+    void findCornerNodesCommMultiGPU();                // runs the corner node search on every level in parH
+    void findCornerNodesXY(int level);                 // fills cornerNodesXtoY: nodes received in x that are also sent in y
+    bool findIndexInSendNodesXY(int level, int index); // true if index is a y send node; then appends its position to cornerNodesXtoY.sendPos
+    void findCornerNodesXZ(int level);                 // fills cornerNodesXtoZ: nodes received in x that are also sent in z
+    bool findIndexInSendNodesXZ(int level, int index); // true if index is a z send node; then appends its position to cornerNodesXtoZ.sendPos
+    void findCornerNodesYZ(int level);                 // fills cornerNodesYtoZ: nodes received in y that are also sent in z
+    bool findIndexInSendNodesYZ(int level, int index); // true if index is a z send node; then appends its position to cornerNodesYtoZ.sendPos
 };
 
 #endif