From 4d62677432abf7fcd5184d4aa82a3136cc4e83d1 Mon Sep 17 00:00:00 2001
From: Anna Wellmann <a.wellmann@tu-braunschweig.de>
Date: Wed, 29 Sep 2021 13:34:01 +0200
Subject: [PATCH] Restructure calls to initialize the communication arrays

This concerns the arrays used for communication after the fine-to-coarse interpolation.
---
 .../grid/GridBuilder/GridBuilder.h            |  16 +-
 .../grid/GridBuilder/LevelGridBuilder.cpp     |  23 -
 .../grid/GridBuilder/LevelGridBuilder.h       |  11 +-
 .../GridReaderGenerator/GridGenerator.cpp     | 871 ++++++++++--------
 .../GridReaderGenerator/GridGenerator.h       |   6 +-
 src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp  |  13 +-
 src/gpu/VirtualFluids_GPU/LBM/Simulation.h    |   4 +-
 7 files changed, 490 insertions(+), 454 deletions(-)

diff --git a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
index f4367f74b..da4ab7e5d 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
@@ -86,14 +86,14 @@ public:
     virtual uint getNumberOfReceiveIndices( int direction, uint level ) = 0;
     virtual void getSendIndices( int* sendIndices, int direction, int level ) = 0;
     virtual void getReceiveIndices( int* sendIndices, int direction, int level ) = 0;
-    virtual std::vector<uint> getAndReorderSendIndices(int *sendIndices, uint &numberOfSendNeighborsAfterFtoC,
-                                                       uint *iCellFCC, uint sizeOfICellFCCBorder, uint *iCellCFC,
-                                                       uint sizeOfICellCFC, uint *neighborX, uint *neighborY,
-                                                       uint *neighborZ, int direction, int level) = 0;
-    virtual void getAndReorderReceiveIndices(int *recvIndices, uint &numberOfRecvNeighborsAfterFtoC,
-                                             std::vector<uint> &sendIndicesForCommAfterFtoCPositions, int direction,
-                                             int level)                                           = 0;
-
+    virtual void reorderRecvIndicesForCommAfterFtoC(int *recvIndices, uint &numberOfRecvNeighborsAfterFtoC,
+                                                    std::vector<uint> &sendIndicesForCommAfterFtoCPositions,
+                                                    int direction, int level) = 0;
+    virtual void reorderSendIndicesForCommAfterFtoC(int *sendIndices, uint &numberOfSendNeighborsAfterFtoC,
+                                                    uint *iCellFCC, uint sizeOfICellFCC, uint *iCellCFC,
+                                                    uint sizeOfICellCFC, uint *neighborX, uint *neighborY,
+                                                    uint *neighborZ, int direction, int level,
+                                                    std::vector<uint> &sendIndicesForCommAfterFtoCPositions) = 0;
 
     virtual uint getNumberOfFluidNodes(unsigned int level) const = 0;
     virtual void getFluidNodeIndices(uint *fluidNodeIndices, const int level) const = 0;
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
index b51dc9dee..026c4baad 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
@@ -267,29 +267,6 @@ GRIDGENERATOR_EXPORT void LevelGridBuilder::getReceiveIndices(int * receiveIndic
     }
 }
 
-GRIDGENERATOR_EXPORT std::vector<uint>
-LevelGridBuilder::getAndReorderSendIndices(int *sendIndices, uint &numberOfSendNeighborsAfterFtoC, uint *iCellFCC,
-                                           uint sizeOfICellFCCBorder, uint *iCellCFC, uint sizeOfICellCFC,
-                                           uint *neighborX, uint *neighborY, uint *neighborZ, int direction, int level)
-{
-    std::vector<uint> sendIndicesForCommAfterFtoCPositions;
-    getSendIndices(sendIndices, direction, level);
-    reorderSendIndicesForCommAfterFtoC(sendIndices, numberOfSendNeighborsAfterFtoC, iCellFCC, sizeOfICellCFC, iCellCFC,
-                                       sizeOfICellCFC, neighborX, neighborY, neighborZ, direction, level,
-                                       sendIndicesForCommAfterFtoCPositions);
-    return sendIndicesForCommAfterFtoCPositions;
-}
-
-GRIDGENERATOR_EXPORT void
-LevelGridBuilder::getAndReorderReceiveIndices(int *recvIndices, uint &numberOfRecvNeighborsAfterFtoC,
-                                              std::vector<uint> &sendIndicesForCommAfterFtoCPositions, int direction,
-                                              int level)
-{
-    getReceiveIndices(recvIndices, direction, level);
-    reorderRecvIndicesForCommAfterFtoC(recvIndices, numberOfRecvNeighborsAfterFtoC, sendIndicesForCommAfterFtoCPositions,
-                                     direction, level);
-}
-
 GRIDGENERATOR_EXPORT void LevelGridBuilder::reorderSendIndicesForCommAfterFtoC(
     int *sendIndices, uint &numberOfSendNeighborsAfterFtoC, uint *iCellFCC, uint sizeOfICellFCC, uint *iCellCFC,
     uint sizeOfICellCFC, uint *neighborX, uint *neighborY, uint *neighborZ, int direction, int level,
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
index 2059438f1..ebb96bf0b 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
@@ -149,24 +149,17 @@ public:
     GRIDGENERATOR_EXPORT uint getNumberOfReceiveIndices( int direction, uint level ) override;
     GRIDGENERATOR_EXPORT void getSendIndices( int* sendIndices, int direction, int level ) override;
     GRIDGENERATOR_EXPORT void getReceiveIndices( int* sendIndices, int direction, int level ) override;
-    GRIDGENERATOR_EXPORT std::vector<uint> GRIDGENERATOR_EXPORT getAndReorderSendIndices(
-        int *sendIndices, uint &numberOfSendNeighborsAfterFtoC, uint *iCellFCC, uint sizeOfICellFCCBorder,
-        uint *iCellCFC, uint sizeOfICellCFC, uint *neighborX, uint *neighborY, uint *neighborZ, int direction,
-        int level) override;
-    GRIDGENERATOR_EXPORT void getAndReorderReceiveIndices(int *recvIndices, uint &numberOfRecvNeighborsAfterFtoC,
-                                                          std::vector<uint> &sendIndicesForCommAfterFtoCPositions,
-                                                          int direction, int level) override;
     GRIDGENERATOR_EXPORT void reorderSendIndicesForCommAfterFtoC(int *sendIndices, uint &numberOfSendNeighborsAfterFtoC,
                                                                uint *iCellFCC, uint sizeOfICellFCC, uint *iCellCFC,
                                                                uint sizeOfICellCFC, uint *neighborX, uint *neighborY,
                                                                uint *neighborZ, int direction, int level,
-                                                               std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+                                                               std::vector<uint> &sendIndicesForCommAfterFtoCPositions) override;
     void findIfSparseIndexIsInSendIndicesAndAddToVectors(int sparseIndex, int *sendIndices, uint numberOfSendIndices,
                                                          std::vector<int> &sendIndicesAfterFtoC,
                                                          std::vector<uint> &sendIndicesForCommAfterFtoCPositions) const;
     GRIDGENERATOR_EXPORT void reorderRecvIndicesForCommAfterFtoC(int *recvIndices, uint &numberOfRecvNeighborsAfterFtoC,
                                                                std::vector<uint> &sendIndicesForCommAfterFtoCPositions,
-                                                               int direction, int level);
+                                                               int direction, int level) override;
 
 };
 
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
index a6feb721e..879c81596 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
@@ -261,439 +261,494 @@ void GridGenerator::allocArrays_BoundaryValues()
         }
     }//ende geo
 
-    if ((para->getNumprocs() > 1) /*&& (procNeighborsSendX.size() == procNeighborsRecvX.size())*/)
-	{
-		for (int direction = 0; direction < 6; direction++)
-		{
-            if( builder->getCommunicationProcess(direction) == INVALID_INDEX ) continue;
+initalValuesDomainDecompostion();
 
-			for (uint level = 0; level < builder->getNumberOfGridLevels(); level++)
-            {
-                if( direction == CommunicationDirections::MX || direction == CommunicationDirections::PX )
-                {
+}
+
+void GridGenerator::initalValuesDomainDecompostion()
+{
+    if (para->getNumprocs() < 2)
+        return;
+    if ((para->getNumprocs() > 1) /*&& (procNeighborsSendX.size() == procNeighborsRecvX.size())*/) {
+        for (int direction = 0; direction < 6; direction++) {
+            if (builder->getCommunicationProcess(direction) == INVALID_INDEX)
+                continue;
+
+            for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+                if (direction == CommunicationDirections::MX || direction == CommunicationDirections::PX) {
                     int j = (int)para->getParH(level)->sendProcessNeighborX.size();
 
-		            para->getParH(level)->sendProcessNeighborX.emplace_back();
-		            para->getParD(level)->sendProcessNeighborX.emplace_back();
-		            para->getParH(level)->recvProcessNeighborX.emplace_back();
-		            para->getParD(level)->recvProcessNeighborX.emplace_back();
-		            if (para->getDiffOn()==true){
-			            para->getParH(level)->sendProcessNeighborADX.emplace_back();
-			            para->getParD(level)->sendProcessNeighborADX.emplace_back();
-			            para->getParH(level)->recvProcessNeighborADX.emplace_back();
-			            para->getParD(level)->recvProcessNeighborADX.emplace_back();
-		            }
-
-				    int tempSend = builder->getNumberOfSendIndices( direction, level );
-				    int tempRecv = builder->getNumberOfReceiveIndices( direction, level );
-				    if (tempSend > 0)
-				    {
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //send
-					    std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborX.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborX.back().numberOfNodes = tempSend;
-					    para->getParD(level)->sendProcessNeighborX.back().numberOfNodes = tempSend;
-					    para->getParH(level)->sendProcessNeighborX.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParD(level)->sendProcessNeighborX.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParH(level)->sendProcessNeighborX.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParD(level)->sendProcessNeighborX.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParH(level)->sendProcessNeighborX.back().memsizeFs = sizeof(real)     *tempSend;
-					    para->getParD(level)->sendProcessNeighborX.back().memsizeFs = sizeof(real)     *tempSend;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //recv
-					    std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborX.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborX.back().numberOfNodes = tempRecv;
-					    para->getParD(level)->recvProcessNeighborX.back().numberOfNodes = tempRecv;
-					    para->getParH(level)->recvProcessNeighborX.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParD(level)->recvProcessNeighborX.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParH(level)->recvProcessNeighborX.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParD(level)->recvProcessNeighborX.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParH(level)->recvProcessNeighborX.back().memsizeFs = sizeof(real)     *tempRecv;
-					    para->getParD(level)->recvProcessNeighborX.back().memsizeFs = sizeof(real)     *tempRecv;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //malloc on host and device
+                    para->getParH(level)->sendProcessNeighborX.emplace_back();
+                    para->getParD(level)->sendProcessNeighborX.emplace_back();
+                    para->getParH(level)->recvProcessNeighborX.emplace_back();
+                    para->getParD(level)->recvProcessNeighborX.emplace_back();
+                    if (para->getDiffOn() == true) {
+                        para->getParH(level)->sendProcessNeighborADX.emplace_back();
+                        para->getParD(level)->sendProcessNeighborADX.emplace_back();
+                        para->getParH(level)->recvProcessNeighborADX.emplace_back();
+                        para->getParD(level)->recvProcessNeighborADX.emplace_back();
+                    }
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborX.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborX.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborX.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborX.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParD(level)->sendProcessNeighborX.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParH(level)->sendProcessNeighborX.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborX.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborX.back().memsizeFs = sizeof(real) * tempSend;
+                        para->getParD(level)->sendProcessNeighborX.back().memsizeFs = sizeof(real) * tempSend;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborX.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborX.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborX.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborX.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParD(level)->recvProcessNeighborX.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParH(level)->recvProcessNeighborX.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborX.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborX.back().memsizeFs = sizeof(real) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborX.back().memsizeFs = sizeof(real) * tempRecv;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
                         cudaMemoryManager->cudaAllocProcessNeighborX(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //init index arrays
-                        if (level == builder->getNumberOfGridLevels() - 1) {
-                            builder->getSendIndices(para->getParH(level)->sendProcessNeighborX[j].index, direction,
-                                                    level);
-                            builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborX[j].index, direction,
-                                                       level);
-                        } else {
-                            para->initNumberOfProcessNeighborsAfterFtoCX(level);
-                            std::vector<uint> sendIndicesForCommAfterFtoCPositions = builder->getAndReorderSendIndices(
-                                para->getParH(level)->sendProcessNeighborX[j].index,
-                                para->getParH(level)->numberOfSendProcessNeighborsAfterFtoCX[j],
-                                para->getParH(level)->intFC.ICellFCC, para->getParH(level)->K_CF,
-                                para->getParH(level)->intCF.ICellCFC, para->getParH(level)->K_FC,
-                                para->getParH(level)->neighborX_SP, para->getParH(level)->neighborY_SP,
-                                para->getParH(level)->neighborZ_SP, direction, level);
-                            builder->getAndReorderReceiveIndices(
-                                para->getParH(level)->recvProcessNeighborX[j].index,
-                                para->getParH(level)->numberOfRecvProcessNeighborsAfterFtoCX[j],
-                                sendIndicesForCommAfterFtoCPositions, direction, level);
-                            para->getParD(level)->numberOfSendProcessNeighborsAfterFtoCX[j] =
-                                para->getParH(level)->numberOfSendProcessNeighborsAfterFtoCX[j];
-                            para->getParD(level)->numberOfRecvProcessNeighborsAfterFtoCX[j] =
-                                para->getParH(level)->numberOfRecvProcessNeighborsAfterFtoCX[j];
-                        }
-					    ////////////////////////////////////////////////////////////////////////////////////////
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborX[j].index, direction, level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborX[j].index, direction,
+                                                   level);
+                        if (level != builder->getNumberOfGridLevels() - 1)
+                            initCommunicationArraysForCommAfterFinetoCoarseX(level, j, direction);                        
+                        ////////////////////////////////////////////////////////////////////////////////////////
                         cudaMemoryManager->cudaCopyProcessNeighborXIndex(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-				    }
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
                 }
-                
-                if( direction == CommunicationDirections::MY || direction == CommunicationDirections::PY )
-                {
+
+                if (direction == CommunicationDirections::MY || direction == CommunicationDirections::PY) {
                     int j = (int)para->getParH(level)->sendProcessNeighborY.size();
 
-		            para->getParH(level)->sendProcessNeighborY.emplace_back();
-		            para->getParD(level)->sendProcessNeighborY.emplace_back();
-		            para->getParH(level)->recvProcessNeighborY.emplace_back();
-		            para->getParD(level)->recvProcessNeighborY.emplace_back();
-		            if (para->getDiffOn()==true){
-			            para->getParH(level)->sendProcessNeighborADY.emplace_back();
-			            para->getParD(level)->sendProcessNeighborADY.emplace_back();
-			            para->getParH(level)->recvProcessNeighborADY.emplace_back();
-			            para->getParD(level)->recvProcessNeighborADY.emplace_back();
-		            }
-
-				    int tempSend = builder->getNumberOfSendIndices( direction, level );
-				    int tempRecv = builder->getNumberOfReceiveIndices( direction, level );
-				    if (tempSend > 0)
-				    {
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //send
-					    std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborY.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborY.back().numberOfNodes = tempSend;
-					    para->getParD(level)->sendProcessNeighborY.back().numberOfNodes = tempSend;
-					    para->getParH(level)->sendProcessNeighborY.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParD(level)->sendProcessNeighborY.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParH(level)->sendProcessNeighborY.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParD(level)->sendProcessNeighborY.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParH(level)->sendProcessNeighborY.back().memsizeFs = sizeof(real)     *tempSend;
-					    para->getParD(level)->sendProcessNeighborY.back().memsizeFs = sizeof(real)     *tempSend;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //recv
-					    std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborY.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborY.back().numberOfNodes = tempRecv;
-					    para->getParD(level)->recvProcessNeighborY.back().numberOfNodes = tempRecv;
-					    para->getParH(level)->recvProcessNeighborY.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParD(level)->recvProcessNeighborY.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParH(level)->recvProcessNeighborY.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParD(level)->recvProcessNeighborY.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParH(level)->recvProcessNeighborY.back().memsizeFs = sizeof(real)     *tempRecv;
-					    para->getParD(level)->recvProcessNeighborY.back().memsizeFs = sizeof(real)     *tempRecv;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //malloc on host and device
+                    para->getParH(level)->sendProcessNeighborY.emplace_back();
+                    para->getParD(level)->sendProcessNeighborY.emplace_back();
+                    para->getParH(level)->recvProcessNeighborY.emplace_back();
+                    para->getParD(level)->recvProcessNeighborY.emplace_back();
+                    if (para->getDiffOn() == true) {
+                        para->getParH(level)->sendProcessNeighborADY.emplace_back();
+                        para->getParD(level)->sendProcessNeighborADY.emplace_back();
+                        para->getParH(level)->recvProcessNeighborADY.emplace_back();
+                        para->getParD(level)->recvProcessNeighborADY.emplace_back();
+                    }
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborY.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborY.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborY.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborY.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParD(level)->sendProcessNeighborY.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParH(level)->sendProcessNeighborY.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborY.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborY.back().memsizeFs = sizeof(real) * tempSend;
+                        para->getParD(level)->sendProcessNeighborY.back().memsizeFs = sizeof(real) * tempSend;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborY.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborY.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborY.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborY.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParD(level)->recvProcessNeighborY.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParH(level)->recvProcessNeighborY.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborY.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborY.back().memsizeFs = sizeof(real) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborY.back().memsizeFs = sizeof(real) * tempRecv;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
                         cudaMemoryManager->cudaAllocProcessNeighborY(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //init index arrays
-                        if (level == builder->getNumberOfGridLevels() - 1) {
-                            builder->getSendIndices(para->getParH(level)->sendProcessNeighborY[j].index, direction,
-                                                    level);
-                            builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborY[j].index, direction,
-                                                       level);
-                        } else {
-                            para->initNumberOfProcessNeighborsAfterFtoCY(level);
-                            std::vector<uint> sendIndicesForCommAfterFtoCPositions = builder->getAndReorderSendIndices(
-                                para->getParH(level)->sendProcessNeighborY[j].index,
-                                para->getParH(level)->numberOfSendProcessNeighborsAfterFtoCY[j],
-                                para->getParH(level)->intFC.ICellFCC, para->getParH(level)->K_CF,
-                                para->getParH(level)->intCF.ICellCFC, para->getParH(level)->K_FC,
-                                para->getParH(level)->neighborX_SP, para->getParH(level)->neighborY_SP,
-                                para->getParH(level)->neighborZ_SP,
-                                direction, level);
-                            builder->getAndReorderReceiveIndices(
-                                para->getParH(level)->recvProcessNeighborY[j].index,
-                                para->getParH(level)->numberOfRecvProcessNeighborsAfterFtoCY[j],
-                                sendIndicesForCommAfterFtoCPositions, direction, level);
-                            para->getParD(level)->numberOfSendProcessNeighborsAfterFtoCY[j] =
-                                para->getParH(level)->numberOfSendProcessNeighborsAfterFtoCY[j];
-                            para->getParD(level)->numberOfRecvProcessNeighborsAfterFtoCY[j] =
-                                para->getParH(level)->numberOfRecvProcessNeighborsAfterFtoCY[j];
-                        }
-					    ////////////////////////////////////////////////////////////////////////////////////////
+                        ////////////////////////////////////////////////////////////////////////////////////////                        
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborY[j].index, direction, level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborY[j].index, direction,
+                                                   level);
+                        if (level != builder->getNumberOfGridLevels() - 1)
+                            initCommunicationArraysForCommAfterFinetoCoarseY(level, j, direction);                       
+                        ////////////////////////////////////////////////////////////////////////////////////////
                         cudaMemoryManager->cudaCopyProcessNeighborYIndex(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-				    }
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
                 }
-                
-                if( direction == CommunicationDirections::MZ || direction == CommunicationDirections::PZ )
-                {
+
+                if (direction == CommunicationDirections::MZ || direction == CommunicationDirections::PZ) {
                     int j = (int)para->getParH(level)->sendProcessNeighborZ.size();
 
-		            para->getParH(level)->sendProcessNeighborZ.emplace_back();
-		            para->getParD(level)->sendProcessNeighborZ.emplace_back();
-		            para->getParH(level)->recvProcessNeighborZ.emplace_back();
-		            para->getParD(level)->recvProcessNeighborZ.emplace_back();
-		            if (para->getDiffOn()==true){
-			            para->getParH(level)->sendProcessNeighborADZ.emplace_back();
-			            para->getParD(level)->sendProcessNeighborADZ.emplace_back();
-			            para->getParH(level)->recvProcessNeighborADZ.emplace_back();
-			            para->getParD(level)->recvProcessNeighborADZ.emplace_back();
-		            }
-
-				    int tempSend = builder->getNumberOfSendIndices( direction, level );
-				    int tempRecv = builder->getNumberOfReceiveIndices( direction, level );
-				    if (tempSend > 0)
-				    {
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //send
-					    std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborZ.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborZ.back().numberOfNodes = tempSend;
-					    para->getParD(level)->sendProcessNeighborZ.back().numberOfNodes = tempSend;
-					    para->getParH(level)->sendProcessNeighborZ.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParD(level)->sendProcessNeighborZ.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParH(level)->sendProcessNeighborZ.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParD(level)->sendProcessNeighborZ.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParH(level)->sendProcessNeighborZ.back().memsizeFs = sizeof(real)     *tempSend;
-					    para->getParD(level)->sendProcessNeighborZ.back().memsizeFs = sizeof(real)     *tempSend;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //recv
-					    std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborZ.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborZ.back().numberOfNodes = tempRecv;
-					    para->getParD(level)->recvProcessNeighborZ.back().numberOfNodes = tempRecv;
-					    para->getParH(level)->recvProcessNeighborZ.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParD(level)->recvProcessNeighborZ.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParH(level)->recvProcessNeighborZ.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParD(level)->recvProcessNeighborZ.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParH(level)->recvProcessNeighborZ.back().memsizeFs = sizeof(real)     *tempRecv;
-					    para->getParD(level)->recvProcessNeighborZ.back().memsizeFs = sizeof(real)     *tempRecv;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //malloc on host and device
+                    para->getParH(level)->sendProcessNeighborZ.emplace_back();
+                    para->getParD(level)->sendProcessNeighborZ.emplace_back();
+                    para->getParH(level)->recvProcessNeighborZ.emplace_back();
+                    para->getParD(level)->recvProcessNeighborZ.emplace_back();
+                    if (para->getDiffOn() == true) {
+                        para->getParH(level)->sendProcessNeighborADZ.emplace_back();
+                        para->getParD(level)->sendProcessNeighborADZ.emplace_back();
+                        para->getParH(level)->recvProcessNeighborADZ.emplace_back();
+                        para->getParD(level)->recvProcessNeighborADZ.emplace_back();
+                    }
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborZ.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborZ.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborZ.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborZ.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParD(level)->sendProcessNeighborZ.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParH(level)->sendProcessNeighborZ.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborZ.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborZ.back().memsizeFs = sizeof(real) * tempSend;
+                        para->getParD(level)->sendProcessNeighborZ.back().memsizeFs = sizeof(real) * tempSend;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborZ.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborZ.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborZ.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborZ.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParD(level)->recvProcessNeighborZ.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParH(level)->recvProcessNeighborZ.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborZ.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborZ.back().memsizeFs = sizeof(real) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborZ.back().memsizeFs = sizeof(real) * tempRecv;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
                         cudaMemoryManager->cudaAllocProcessNeighborZ(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //init index arrays
-                        if (level == builder->getNumberOfGridLevels() - 1) {
-                            builder->getSendIndices(para->getParH(level)->sendProcessNeighborZ[j].index, direction,
-                                                    level);
-                            builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborZ[j].index, direction,
-                                                       level);
-                        } else {
-                            para->initNumberOfProcessNeighborsAfterFtoCZ(level);
-                            std::vector<uint> sendIndicesForCommAfterFtoCPositions = builder->getAndReorderSendIndices(
-                                para->getParH(level)->sendProcessNeighborZ[j].index,
-                                para->getParH(level)->numberOfSendProcessNeighborsAfterFtoCZ[j],
-                                para->getParH(level)->intFC.ICellFCC, para->getParH(level)->K_CF,
-                                para->getParH(level)->intCF.ICellCFC, para->getParH(level)->K_FC,
-                                para->getParH(level)->neighborX_SP, para->getParH(level)->neighborY_SP,
-                                para->getParH(level)->neighborZ_SP,
-                                direction, level);
-                            builder->getAndReorderReceiveIndices(
-                                para->getParH(level)->recvProcessNeighborZ[j].index,
-                                para->getParH(level)->numberOfRecvProcessNeighborsAfterFtoCZ[j],
-                                sendIndicesForCommAfterFtoCPositions, direction, level);
-                            para->getParD(level)->numberOfSendProcessNeighborsAfterFtoCZ[j] =
-                                para->getParH(level)->numberOfSendProcessNeighborsAfterFtoCZ[j];
-                            para->getParD(level)->numberOfRecvProcessNeighborsAfterFtoCZ[j] =
-                                para->getParH(level)->numberOfRecvProcessNeighborsAfterFtoCZ[j];
-                        }
-					    ////////////////////////////////////////////////////////////////////////////////////////
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborZ[j].index, direction, level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborZ[j].index, direction,
+                                                   level);
+                        if (level != builder->getNumberOfGridLevels() - 1)
+                            initCommunicationArraysForCommAfterFinetoCoarseZ(level, j, direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
                         cudaMemoryManager->cudaCopyProcessNeighborZIndex(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-				    }
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
                 }
+            }
+        }
+    }
 
-			}
-		}
-	}
-
+    // data exchange for F3 / G6
+    if ((para->getNumprocs() > 1) && (para->getIsF3())) {
+        for (int direction = 0; direction < 6; direction++) {
+            if (builder->getCommunicationProcess(direction) == INVALID_INDEX)
+                continue;
 
-	// data exchange for F3 / G6
-	if ((para->getNumprocs() > 1) && (para->getIsF3()) )
-	{
-		for (int direction = 0; direction < 6; direction++)
-		{
-			if (builder->getCommunicationProcess(direction) == INVALID_INDEX) continue;
-
-			for (uint level = 0; level < builder->getNumberOfGridLevels(); level++)
-			{
-				if (direction == CommunicationDirections::MX || direction == CommunicationDirections::PX)
-				{
+            for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+                if (direction == CommunicationDirections::MX || direction == CommunicationDirections::PX) {
                     int j = (int)para->getParH(level)->sendProcessNeighborF3X.size();
 
-					para->getParH(level)->sendProcessNeighborF3X.emplace_back();
-					para->getParD(level)->sendProcessNeighborF3X.emplace_back();
-					para->getParH(level)->recvProcessNeighborF3X.emplace_back();
-					para->getParD(level)->recvProcessNeighborF3X.emplace_back();
-
-					int tempSend = builder->getNumberOfSendIndices(direction, level);
-					int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
-					if (tempSend > 0)
-					{
-						////////////////////////////////////////////////////////////////////////////////////////
-						//send
-						std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3X.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3X.back().numberOfNodes = tempSend;
-						para->getParD(level)->sendProcessNeighborF3X.back().numberOfNodes = tempSend;
-						para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs = 6 * tempSend;
-						para->getParD(level)->sendProcessNeighborF3X.back().numberOfGs = 6 * tempSend;
-						para->getParH(level)->sendProcessNeighborF3X.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParD(level)->sendProcessNeighborF3X.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParH(level)->sendProcessNeighborF3X.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs;
-						para->getParD(level)->sendProcessNeighborF3X.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//recv
-						std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3X.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3X.back().numberOfNodes = tempRecv;
-						para->getParD(level)->recvProcessNeighborF3X.back().numberOfNodes = tempRecv;
-						para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs = 6 * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3X.back().numberOfGs = 6 * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3X.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3X.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3X.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs;
-						para->getParD(level)->recvProcessNeighborF3X.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//malloc on host and device
-						cudaMemoryManager->cudaAllocProcessNeighborF3X(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-						//init index arrays
-						builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3X[j].index, direction, level);
-						builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3X[j].index, direction, level);
-						////////////////////////////////////////////////////////////////////////////////////////
-						cudaMemoryManager->cudaCopyProcessNeighborF3XIndex(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-					}
-				}
-
-				if (direction == CommunicationDirections::MY || direction == CommunicationDirections::PY)
-				{
+                    para->getParH(level)->sendProcessNeighborF3X.emplace_back();
+                    para->getParD(level)->sendProcessNeighborF3X.emplace_back();
+                    para->getParH(level)->recvProcessNeighborF3X.emplace_back();
+                    para->getParD(level)->recvProcessNeighborF3X.emplace_back();
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3X.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3X.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborF3X.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs    = 6 * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3X.back().numberOfGs    = 6 * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3X.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3X.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3X.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs;
+                        para->getParD(level)->sendProcessNeighborF3X.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3X.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3X.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3X.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs    = 6 * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3X.back().numberOfGs    = 6 * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3X.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3X.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3X.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs;
+                        para->getParD(level)->recvProcessNeighborF3X.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
+                        cudaMemoryManager->cudaAllocProcessNeighborF3X(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3X[j].index, direction,
+                                                level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3X[j].index, direction,
+                                                   level);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        cudaMemoryManager->cudaCopyProcessNeighborF3XIndex(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
+                }
+
+                if (direction == CommunicationDirections::MY || direction == CommunicationDirections::PY) {
                     int j = (int)para->getParH(level)->sendProcessNeighborF3Y.size();
 
-					para->getParH(level)->sendProcessNeighborF3Y.emplace_back();
-					para->getParD(level)->sendProcessNeighborF3Y.emplace_back();
-					para->getParH(level)->recvProcessNeighborF3Y.emplace_back();
-					para->getParD(level)->recvProcessNeighborF3Y.emplace_back();
-
-					int tempSend = builder->getNumberOfSendIndices(direction, level);
-					int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
-					if (tempSend > 0)
-					{
-						////////////////////////////////////////////////////////////////////////////////////////
-						//send
-						std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3Y.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3Y.back().numberOfNodes = tempSend;
-						para->getParD(level)->sendProcessNeighborF3Y.back().numberOfNodes = tempSend;
-						para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs = 6 * tempSend;
-						para->getParD(level)->sendProcessNeighborF3Y.back().numberOfGs = 6 * tempSend;
-						para->getParH(level)->sendProcessNeighborF3Y.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParD(level)->sendProcessNeighborF3Y.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParH(level)->sendProcessNeighborF3Y.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs;
-						para->getParD(level)->sendProcessNeighborF3Y.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//recv
-						std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3Y.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3Y.back().numberOfNodes = tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Y.back().numberOfNodes = tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs = 6 * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Y.back().numberOfGs = 6 * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Y.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Y.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Y.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs;
-						para->getParD(level)->recvProcessNeighborF3Y.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//malloc on host and device
-						cudaMemoryManager->cudaAllocProcessNeighborF3Y(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-						//init index arrays
-						builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3Y[j].index, direction, level);
-						builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3Y[j].index, direction, level);
-						////////////////////////////////////////////////////////////////////////////////////////
-						cudaMemoryManager->cudaCopyProcessNeighborF3YIndex(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-					}
-				}
-
-				if (direction == CommunicationDirections::MZ || direction == CommunicationDirections::PZ)
-				{
+                    para->getParH(level)->sendProcessNeighborF3Y.emplace_back();
+                    para->getParD(level)->sendProcessNeighborF3Y.emplace_back();
+                    para->getParH(level)->recvProcessNeighborF3Y.emplace_back();
+                    para->getParD(level)->recvProcessNeighborF3Y.emplace_back();
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3Y.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3Y.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Y.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs    = 6 * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Y.back().numberOfGs    = 6 * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Y.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Y.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Y.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs;
+                        para->getParD(level)->sendProcessNeighborF3Y.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3Y.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3Y.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Y.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs    = 6 * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Y.back().numberOfGs    = 6 * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Y.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Y.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Y.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs;
+                        para->getParD(level)->recvProcessNeighborF3Y.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
+                        cudaMemoryManager->cudaAllocProcessNeighborF3Y(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3Y[j].index, direction,
+                                                level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3Y[j].index, direction,
+                                                   level);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        cudaMemoryManager->cudaCopyProcessNeighborF3YIndex(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
+                }
+
+                if (direction == CommunicationDirections::MZ || direction == CommunicationDirections::PZ) {
                     int j = (int)para->getParH(level)->sendProcessNeighborF3Z.size();
 
-					para->getParH(level)->sendProcessNeighborF3Z.emplace_back();
-					para->getParD(level)->sendProcessNeighborF3Z.emplace_back();
-					para->getParH(level)->recvProcessNeighborF3Z.emplace_back();
-					para->getParD(level)->recvProcessNeighborF3Z.emplace_back();
-
-					int tempSend = builder->getNumberOfSendIndices(direction, level);
-					int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
-					if (tempSend > 0)
-					{
-						////////////////////////////////////////////////////////////////////////////////////////
-						//send
-						std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3Z.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3Z.back().numberOfNodes = tempSend;
-						para->getParD(level)->sendProcessNeighborF3Z.back().numberOfNodes = tempSend;
-						para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs = 6 * tempSend;
-						para->getParD(level)->sendProcessNeighborF3Z.back().numberOfGs = 6 * tempSend;
-						para->getParH(level)->sendProcessNeighborF3Z.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParD(level)->sendProcessNeighborF3Z.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParH(level)->sendProcessNeighborF3Z.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs;
-						para->getParD(level)->sendProcessNeighborF3Z.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//recv
-						std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3Z.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3Z.back().numberOfNodes = tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Z.back().numberOfNodes = tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs = 6 * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Z.back().numberOfGs = 6 * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Z.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Z.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Z.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs;
-						para->getParD(level)->recvProcessNeighborF3Z.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//malloc on host and device
-						cudaMemoryManager->cudaAllocProcessNeighborF3Z(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-						//init index arrays
-						builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3Z[j].index, direction, level);
-						builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3Z[j].index, direction, level);
-						////////////////////////////////////////////////////////////////////////////////////////
-						cudaMemoryManager->cudaCopyProcessNeighborF3ZIndex(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-					}
-				}
-
-			}
-		}
-	}
+                    para->getParH(level)->sendProcessNeighborF3Z.emplace_back();
+                    para->getParD(level)->sendProcessNeighborF3Z.emplace_back();
+                    para->getParH(level)->recvProcessNeighborF3Z.emplace_back();
+                    para->getParD(level)->recvProcessNeighborF3Z.emplace_back();
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3Z.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3Z.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Z.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs    = 6 * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Z.back().numberOfGs    = 6 * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Z.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Z.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Z.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs;
+                        para->getParD(level)->sendProcessNeighborF3Z.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3Z.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3Z.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Z.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs    = 6 * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Z.back().numberOfGs    = 6 * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Z.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Z.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Z.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs;
+                        para->getParD(level)->recvProcessNeighborF3Z.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
+                        cudaMemoryManager->cudaAllocProcessNeighborF3Z(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3Z[j].index, direction,
+                                                level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3Z[j].index, direction,
+                                                   level);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        cudaMemoryManager->cudaCopyProcessNeighborF3ZIndex(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
+                }
+            }
+        }
+    }
+}
 
+void GridGenerator::initCommunicationArraysForCommAfterFinetoCoarseZ(const uint &level, int j, int direction)
+{
+    // Reorders the send/recv index arrays of Z process neighbor j for the communication after the
+    // fine-to-coarse interpolation, then copies the resulting counters from the host to the device parameters.
+    para->initNumberOfProcessNeighborsAfterFtoCZ(level);
+    const auto parH = para->getParH(level);
+    auto &numSendAfterFtoC = parH->numberOfSendProcessNeighborsAfterFtoCZ[j];
+    auto &numRecvAfterFtoC = parH->numberOfRecvProcessNeighborsAfterFtoCZ[j];
+    std::vector<uint> sendPositions; // handed from the send reordering on to the recv reordering
+    builder->reorderSendIndicesForCommAfterFtoC(parH->sendProcessNeighborZ[j].index, numSendAfterFtoC,
+                                                parH->intFC.ICellFCC, parH->K_CF, parH->intCF.ICellCFC, parH->K_FC,
+                                                parH->neighborX_SP, parH->neighborY_SP, parH->neighborZ_SP,
+                                                direction, level, sendPositions);
+    builder->reorderRecvIndicesForCommAfterFtoC(parH->recvProcessNeighborZ[j].index, numRecvAfterFtoC,
+                                                sendPositions, direction, level);
+    para->getParD(level)->numberOfSendProcessNeighborsAfterFtoCZ[j] = numSendAfterFtoC;
+    para->getParD(level)->numberOfRecvProcessNeighborsAfterFtoCZ[j] = numRecvAfterFtoC;
+}
+
+void GridGenerator::initCommunicationArraysForCommAfterFinetoCoarseY(const uint &level, int j, int direction)
+{
+    // Reorders the send/recv index arrays of Y process neighbor j for the communication after the
+    // fine-to-coarse interpolation, then copies the resulting counters from the host to the device parameters.
+    para->initNumberOfProcessNeighborsAfterFtoCY(level);
+    const auto parH = para->getParH(level);
+    auto &numSendAfterFtoC = parH->numberOfSendProcessNeighborsAfterFtoCY[j];
+    auto &numRecvAfterFtoC = parH->numberOfRecvProcessNeighborsAfterFtoCY[j];
+    std::vector<uint> sendPositions; // handed from the send reordering on to the recv reordering
+    builder->reorderSendIndicesForCommAfterFtoC(parH->sendProcessNeighborY[j].index, numSendAfterFtoC,
+                                                parH->intFC.ICellFCC, parH->K_CF, parH->intCF.ICellCFC, parH->K_FC,
+                                                parH->neighborX_SP, parH->neighborY_SP, parH->neighborZ_SP,
+                                                direction, level, sendPositions);
+    builder->reorderRecvIndicesForCommAfterFtoC(parH->recvProcessNeighborY[j].index, numRecvAfterFtoC,
+                                                sendPositions, direction, level);
+    para->getParD(level)->numberOfSendProcessNeighborsAfterFtoCY[j] = numSendAfterFtoC;
+    para->getParD(level)->numberOfRecvProcessNeighborsAfterFtoCY[j] = numRecvAfterFtoC;
+}
+
+void GridGenerator::initCommunicationArraysForCommAfterFinetoCoarseX(const uint &level, int j, int direction)
+{
+    // Reorders the send/recv index arrays of X process neighbor j for the communication after the
+    // fine-to-coarse interpolation, then copies the resulting counters from the host to the device parameters.
+    para->initNumberOfProcessNeighborsAfterFtoCX(level);
+    const auto parH = para->getParH(level);
+    auto &numSendAfterFtoC = parH->numberOfSendProcessNeighborsAfterFtoCX[j];
+    auto &numRecvAfterFtoC = parH->numberOfRecvProcessNeighborsAfterFtoCX[j];
+    std::vector<uint> sendPositions; // handed from the send reordering on to the recv reordering
+    builder->reorderSendIndicesForCommAfterFtoC(parH->sendProcessNeighborX[j].index, numSendAfterFtoC,
+                                                parH->intFC.ICellFCC, parH->K_CF, parH->intCF.ICellCFC, parH->K_FC,
+                                                parH->neighborX_SP, parH->neighborY_SP, parH->neighborZ_SP,
+                                                direction, level, sendPositions);
+    builder->reorderRecvIndicesForCommAfterFtoC(parH->recvProcessNeighborX[j].index, numRecvAfterFtoC,
+                                                sendPositions, direction, level);
+    para->getParD(level)->numberOfSendProcessNeighborsAfterFtoCX[j] = numSendAfterFtoC;
+    para->getParD(level)->numberOfRecvProcessNeighborsAfterFtoCX[j] = numRecvAfterFtoC;
 }
 
 
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
index 5659cad85..0845dafeb 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
@@ -26,7 +26,11 @@ public:
 	VIRTUALFLUIDS_GPU_EXPORT virtual ~GridGenerator();
 
 	void allocArrays_CoordNeighborGeo() override;
-	void allocArrays_BoundaryValues() override;
+    void allocArrays_BoundaryValues() override;
+    void initalValuesDomainDecompostion();
+    void initCommunicationArraysForCommAfterFinetoCoarseZ(const uint &level, int j, int direction);
+    void initCommunicationArraysForCommAfterFinetoCoarseY(const uint &level, int j, int direction);
+    void initCommunicationArraysForCommAfterFinetoCoarseX(const uint &level, int j, int direction);
 	void allocArrays_BoundaryQs() override;
     void allocArrays_OffsetScale() override;
     void allocArrays_fluidNodeIndices() override;
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 4a7556499..584e919c3 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -138,10 +138,7 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    /////////////////////////////////////////////////////////////////////////
    cudaManager->setMemsizeGPU(0, true);
    //////////////////////////////////////////////////////////////////////////
-   gridProvider->allocArrays_CoordNeighborGeo();
-   gridProvider->allocArrays_OffsetScale();
-   gridProvider->allocArrays_BoundaryValues();
-   gridProvider->allocArrays_BoundaryQs();
+   allocNeighborsOffsetsScalesAndBoundaries(gridProvider);
 
    //////////////////////////////////////////////////////////////////////////
    //Kernel init
@@ -378,6 +375,14 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    //InterfaceDebugWriter::writeInterfaceLinesDebugFC(para.get());
 }
 
+void Simulation::allocNeighborsOffsetsScalesAndBoundaries(SPtr<GridProvider> &gridProvider)
+{
+    gridProvider->allocArrays_CoordNeighborGeo();
+    gridProvider->allocArrays_OffsetScale();
+    gridProvider->allocArrays_BoundaryValues(); // must be called after allocArrays_OffsetScale() because of initCommunicationArraysForCommAfterFinetoCoarseX/Y/Z()
+    gridProvider->allocArrays_BoundaryQs();
+}
+
 void Simulation::bulk()
 {
 
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.h b/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
index d6c6702c4..ea7b59daa 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
@@ -39,7 +39,9 @@ class VIRTUALFLUIDS_GPU_EXPORT Simulation
 {
 public:
 	void run();
-	void init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std::shared_ptr<DataWriter> dataWriter, std::shared_ptr<CudaMemoryManager> cudaManager);
+    void init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std::shared_ptr<DataWriter> dataWriter,
+              std::shared_ptr<CudaMemoryManager> cudaManager);
+    void allocNeighborsOffsetsScalesAndBoundaries(SPtr<GridProvider> &gridProvider);
 	void free();
 	void bulk();
 	void porousMedia();
-- 
GitLab