diff --git a/src/GksGpu/BoundaryConditions/BoundaryCondition.cpp b/src/GksGpu/BoundaryConditions/BoundaryCondition.cpp
index 94e1f02382ab455ab49a094f3499e8817ded90a8..c7e5a6ee0cf0cb247a354d78dcede9493de3faaf 100644
--- a/src/GksGpu/BoundaryConditions/BoundaryCondition.cpp
+++ b/src/GksGpu/BoundaryConditions/BoundaryCondition.cpp
@@ -54,6 +54,8 @@ void BoundaryCondition::findBoundaryCells(GksMeshAdapter & adapter, bool allowGh
 
             if( cell.type != STOPPER_OUT_OF_GRID && cell.type != STOPPER_OUT_OF_GRID_BOUNDARY && cell.type != STOPPER_SOLID ) continue;
 
+            if( cell.isRecvCell ) continue;
+
             // look in all directions
             uint maximalSearchDirection = 27;
 
diff --git a/src/GksGpu/DataBase/DataBase.cpp b/src/GksGpu/DataBase/DataBase.cpp
index a1bed9392d1b00e73d82d1549416d8d45e5d5557..36b6194b8f2ae65230259106e05697e6f30daaa1 100644
--- a/src/GksGpu/DataBase/DataBase.cpp
+++ b/src/GksGpu/DataBase/DataBase.cpp
@@ -9,7 +9,7 @@
 #include "DataBaseAllocator.h"
 #include "DataBaseStruct.h"
 
-#include "core/Logger/Logger.h"
+#include "Core/Logger/Logger.h"
 
 #include "GksMeshAdapter/GksMeshAdapter.h"
 #include "Communication/Communicator.h"
diff --git a/src/GridGenerator/grid/GridImp.cu b/src/GridGenerator/grid/GridImp.cu
index 791f3808ad2db1b869e040f97967ad783c123aef..3d1ec576b8a2609c53dd5c1995da8e5f8fe5f151 100644
--- a/src/GridGenerator/grid/GridImp.cu
+++ b/src/GridGenerator/grid/GridImp.cu
@@ -1415,11 +1415,12 @@ void GridImp::findCommunicationIndices(int direction, SPtr<BoundingBox> subDomai
         if( this->getFieldEntry(index) == INVALID_OUT_OF_GRID ||
             this->getFieldEntry(index) == INVALID_SOLID ||
             this->getFieldEntry(index) == INVALID_COARSE_UNDER_FINE ||
-            this->getFieldEntry(index) == STOPPER_SOLID ||
+            //this->getFieldEntry(index) == STOPPER_SOLID ||
             this->getFieldEntry(index) == STOPPER_OUT_OF_GRID ||
             this->getFieldEntry(index) == STOPPER_COARSE_UNDER_FINE ) continue;
 
         if( lbmOrGks == LBM && this->getFieldEntry(index) == STOPPER_OUT_OF_GRID_BOUNDARY ) continue;
+        if( lbmOrGks == LBM && this->getFieldEntry(index) == STOPPER_SOLID ) continue;
 
         if( direction == CommunicationDirections::MX ) findCommunicationIndex( index, x, subDomainBox->minX, direction);
         if( direction == CommunicationDirections::PX ) findCommunicationIndex( index, x, subDomainBox->maxX, direction);
diff --git a/targets/apps/GKS/MultiGPU_nD/MultiGPU_nD.cpp b/targets/apps/GKS/MultiGPU_nD/MultiGPU_nD.cpp
index f44ce0270b436c8dbf718d0f2b677b4c1767b8f7..a9309a2cfd8613f754508c5a3816e1be37da0a27 100644
--- a/targets/apps/GKS/MultiGPU_nD/MultiGPU_nD.cpp
+++ b/targets/apps/GKS/MultiGPU_nD/MultiGPU_nD.cpp
@@ -306,8 +306,8 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
     {
         real U = 0.1;
 
-        real ULocal =   /*0.1*/ + U * sin( 2.0 * M_PI * cellCenter.x ) * cos( 2.0 * M_PI * cellCenter.y ) * cos( 2.0 * M_PI * cellCenter.z );
-        real VLocal =   /*0.1*/ - U * cos( 2.0 * M_PI * cellCenter.x ) * sin( 2.0 * M_PI * cellCenter.y ) * cos( 2.0 * M_PI * cellCenter.z );
+        real ULocal =   0.1 + U * sin( 2.0 * M_PI * cellCenter.x ) * cos( 2.0 * M_PI * cellCenter.y ) * cos( 2.0 * M_PI * cellCenter.z );
+        real VLocal =   0.1 - U * cos( 2.0 * M_PI * cellCenter.x ) * sin( 2.0 * M_PI * cellCenter.y ) * cos( 2.0 * M_PI * cellCenter.z );
         real WLocal =   0.1;
 
         real rho = 1.0;
@@ -337,11 +337,11 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
 
     Initializer::initializeDataUpdate(dataBase);
 
-    //dataBase->copyDataDeviceToHost();
+    dataBase->copyDataDeviceToHost();
 
-    //if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_0", mpiWorldSize );
+    if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_0", mpiWorldSize );
 
-    //writeVtkXML( dataBase, parameters, 0, path + simulationName + "_0" + "_rank_" + std::to_string(rank) );
+    writeVtkXML( dataBase, parameters, 0, path + simulationName + "_0" + "_rank_" + std::to_string(rank) );
     
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -365,11 +365,11 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
 
     //////////////////////////////////////////////////////////////////////////
 
-    //dataBase->copyDataDeviceToHost();
+    dataBase->copyDataDeviceToHost();
 
-    //if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_final", mpiWorldSize );
+    if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_final", mpiWorldSize );
 
-    //writeVtkXML( dataBase, parameters, 0, path + simulationName + "_final_rank_" + std::to_string(rank) );
+    writeVtkXML( dataBase, parameters, 0, path + simulationName + "_final_rank_" + std::to_string(rank) );
     
     //////////////////////////////////////////////////////////////////////////
 
diff --git a/targets/apps/GKS/RoomFireExtended/RoomFireExtended.cpp b/targets/apps/GKS/RoomFireExtended/RoomFireExtended.cpp
index c4a6c4167a85d45afef468756fe6a63dedb3ae6a..8df6389537c6167cea7ba8060d4d57bb0bc7ca9d 100644
--- a/targets/apps/GKS/RoomFireExtended/RoomFireExtended.cpp
+++ b/targets/apps/GKS/RoomFireExtended/RoomFireExtended.cpp
@@ -3,6 +3,7 @@
 #define _USE_MATH_DEFINES
 #include <math.h>
 #include <string>
+#include <sstream>
 #include <iostream>
 #include <exception>
 #include <fstream>
@@ -188,6 +189,8 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
 
     gridBuilder->addCoarseGrid(-2.1, -1.6, -0.1,  
                                 2.1,  6.0,  5.0, dx);
+    //gridBuilder->addCoarseGrid(-1.1, -1.2,  1.1,  
+                                //1.2,  1.2,  2.2, dx);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -228,41 +231,7 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
     Cuboid boxCoarse ( -2.0, -3.0, -0.5, 
                         3.0,  3.0,  3.5 );
 
-    //gridBuilder->addGrid( &boxCoarse, 1 );
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-    Cuboid roomRef( -2.1, -1.8, -1.0, 
-                     2.1,  1.7, 10.0 );
-    
-    Cuboid windowRef( -1.1,  1.6,  0.9, 
-                       1.1,  2.0,  3.0 );
-
-    Conglomerate refRegion1;
-
-    refRegion1.add( &roomRef );
-    refRegion1.add( &windowRef );
-
-    gridBuilder->setNumberOfLayers(0,22);
-
-    //gridBuilder->addGrid( &refRegion1, 2 );
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-    Cuboid boxRef ( -0.6 * LBurner, -0.6 * LBurner, -1.0, 
-                     0.6 * LBurner,  0.6 * LBurner, 10.0 );
-    Cuboid beamRef( -10.0, -0.25, 2.4, 10.0, 0.25, 10.0 );
-
-    Conglomerate refRegion2;
-
-    refRegion2.add( &boxRef );
-    refRegion2.add( &beamRef );
-
-    gridBuilder->setNumberOfLayers(0,22);
-
-    //gridBuilder->addGrid( &refRegion2, 3 );
-
-    uint maxLevel = gridBuilder->getNumberOfGridLevels() - 1;
+    gridBuilder->addGrid( &boxCoarse, 1 );
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -303,10 +272,46 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+    Cuboid roomRef( -2.1, -1.8, -1.0, 
+                     2.1,  1.7, 10.0 );
+    
+    Cuboid windowRef( -1.1,  1.6,  0.9, 
+                       1.1,  2.0,  3.0 );
+
+    Conglomerate refRegion1;
+
+    refRegion1.add( &roomRef );
+    refRegion1.add( &windowRef );
+
+    gridBuilder->setNumberOfLayers(0,22);
+
+    //gridBuilder->addGrid( &refRegion1, 2 );
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    Cuboid boxRef ( -0.6 * LBurner, -0.6 * LBurner, -1.0, 
+                     0.6 * LBurner,  0.6 * LBurner, 10.0 );
+    Cuboid beamRef( -10.0, -0.25, 2.4, 10.0, 0.25, 10.0 );
+
+    Conglomerate refRegion2;
+
+    refRegion2.add( &boxRef );
+    refRegion2.add( &beamRef );
+
+    gridBuilder->setNumberOfLayers(0,22);
+
+    //gridBuilder->addGrid( &refRegion2, 3 );
+
+    uint maxLevel = gridBuilder->getNumberOfGridLevels() - 1;
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
     gridBuilder->setPeriodicBoundaryCondition(false, false, false);
 
     gridBuilder->buildGrids(GKS, false);
 
+    MPI_Barrier(MPI_COMM_WORLD);
+
     //gridBuilder->writeGridsToVtk(path + "Grid_rank_" + std::to_string( rank ) + "_lev_");
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -358,7 +363,7 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    CudaUtility::setCudaDevice( rank % CudaUtility::getCudaDeviceCount() );
+    //CudaUtility::setCudaDevice( rank % CudaUtility::getCudaDeviceCount() );
 
     auto dataBase = std::make_shared<DataBase>( "GPU" );
 
@@ -407,8 +412,6 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
 
     dataBase->boundaryConditions.push_back( bcWall );
 
-    //dataBase->boundaryConditions.push_back( bcTop );
-
     dataBase->boundaryConditions.push_back( bcOpen );
 
     dataBase->boundaryConditions.push_back( bcPressure );
@@ -457,7 +460,11 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
     {
         Initializer::interpret(dataBase, [&](Vec3 cellCenter) -> ConservedVariables {
 
-            return toConservedVariables(prim, parameters.K);
+            PrimitiveVariables primLocal = prim;
+
+            //if( cellCenter.x > 0 ) primLocal.rho = 1.21;
+
+            return toConservedVariables(primLocal, parameters.K);
         });
 
         if (rank == 0) writeVtkXMLParallelSummaryFile(dataBase, parameters, path + simulationName + "_0", mpiWorldSize);
@@ -502,6 +509,8 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
     *logging::out << logging::Logger::INFO_HIGH << "================================================================================\n";
     *logging::out << logging::Logger::INFO_HIGH << "================================================================================\n";
 
+    MPI_Barrier(MPI_COMM_WORLD);
+
     cupsAnalyzer.start();
 
     for( uint iter = startIter + 1; iter <= 100000000; iter++ )
@@ -524,7 +533,7 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
 
         //////////////////////////////////////////////////////////////////////////
 
-        pointTimeSeriesCollector->run(iter, parameters);
+        //pointTimeSeriesCollector->run(iter, parameters);
 
         int crashCellIndex = dataBase->getCrashCellIndex();
         if( crashCellIndex >= 0 )
@@ -536,7 +545,7 @@ void thermalCavity( std::string path, std::string simulationName, uint windowInd
             break;
         }
 
-        if( iter % 100 == 0 )
+        if( iter % 1000 == 0 )
         {
             dataBase->copyDataDeviceToHost();
 
@@ -599,6 +608,22 @@ int main( int argc, char* argv[])
     logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
     logging::Logger::timeStamp(logging::Logger::ENABLE);
 
+    //////////////////////////////////////////////////////////////////////////
+
+    // Important: for Cuda-Aware MPI the device must be set before MPI_Init()
+    int deviceCount = CudaUtility::getCudaDeviceCount();
+
+    if(deviceCount == 0)
+    {
+        std::stringstream msg;
+        msg << "No devices devices found!" << std::endl;
+        *logging::out << logging::Logger::WARNING << msg.str(); msg.str("");
+    }
+
+    CudaUtility::setCudaDevice( rank % deviceCount );
+
+    //////////////////////////////////////////////////////////////////////////
+
     if( sizeof(real) == 4 )
         *logging::out << logging::Logger::INFO_HIGH << "Using Single Precision\n";
     else
@@ -633,6 +658,8 @@ int main( int argc, char* argv[])
 
     logFile.close();
 
+    MPI_Finalize();
+
     return 0;
 }