From b6fddddaa056b83297b451e38084155b1fa22ac8 Mon Sep 17 00:00:00 2001
From: "LEGOLAS\\lenz" <lenz@irmb.tu-bs.de>
Date: Thu, 18 Jul 2019 16:43:39 +0200
Subject: [PATCH] clean up of communication hiding

---
 CMakeLists.txt                                |   2 +-
 .../BoundaryConditions/AdiabaticWall.cu       |   2 +-
 src/GksGpu/BoundaryConditions/Periodic.cu     |   2 -
 src/GksGpu/Communication/Communicator.cpp     |  57 +--
 src/GksGpu/Communication/Communicator.cu      |   4 -
 src/GksGpu/Communication/Communicator.h       |  13 +-
 src/GksGpu/CudaUtility/CudaUtility.cpp        |   9 +-
 src/GksGpu/CudaUtility/CudaUtility.h          |   2 -
 src/GksGpu/DataBase/DataBaseAllocatorGPU.cpp  |  12 +-
 src/GksGpu/TimeStepping/NestedTimeStep.cpp    |  56 +--
 targets/apps/GKS/MultiGPU/MultiGPU.cpp        |   6 +-
 targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp  | 328 +++++++++---------
 12 files changed, 218 insertions(+), 275 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 484ce62de..37aa443bc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -180,7 +180,7 @@ IF (HULC.BUILD_VF_GKS)
     #add_subdirectory(targets/apps/GKS/Candle)
     
     add_subdirectory(targets/apps/GKS/MultiGPU)
-    #add_subdirectory(targets/apps/GKS/MultiGPU_2D)
+    add_subdirectory(targets/apps/GKS/MultiGPU_2D)
 ELSE()
   MESSAGE( STATUS "exclude Virtual Fluids GKS." )
 ENDIF()
diff --git a/src/GksGpu/BoundaryConditions/AdiabaticWall.cu b/src/GksGpu/BoundaryConditions/AdiabaticWall.cu
index eff77e7fd..642dcbe80 100644
--- a/src/GksGpu/BoundaryConditions/AdiabaticWall.cu
+++ b/src/GksGpu/BoundaryConditions/AdiabaticWall.cu
@@ -54,7 +54,7 @@ void AdiabaticWall::runBoundaryConditionKernel(const SPtr<DataBase> dataBase,
                parameters,
                this->startOfCellsPerLevel[ level ] );
 
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
 
     getLastCudaError("AdiabaticWall::runBoundaryConditionKernel( const SPtr<DataBase> dataBase, const Parameters parameters, const uint level )");
 }
diff --git a/src/GksGpu/BoundaryConditions/Periodic.cu b/src/GksGpu/BoundaryConditions/Periodic.cu
index 46ce82874..802054854 100644
--- a/src/GksGpu/BoundaryConditions/Periodic.cu
+++ b/src/GksGpu/BoundaryConditions/Periodic.cu
@@ -56,8 +56,6 @@ void Periodic::runBoundaryConditionKernel(const SPtr<DataBase> dataBase,
                parameters,
                this->startOfCellsPerLevel[ level ] );
 
-    cudaDeviceSynchronize();
-
     getLastCudaError("Periodic::runBoundaryConditionKernel( const SPtr<DataBase> dataBase, const Parameters parameters, const uint level )");
 }
 
diff --git a/src/GksGpu/Communication/Communicator.cpp b/src/GksGpu/Communication/Communicator.cpp
index bd5d4e31a..1e4d07f33 100644
--- a/src/GksGpu/Communication/Communicator.cpp
+++ b/src/GksGpu/Communication/Communicator.cpp
@@ -22,6 +22,9 @@
 
 #include "CudaUtility/CudaUtility.h"
 
+int Communicator::tagSendPositive = 0;
+int Communicator::tagSendNegative = 1;
+
 Communicator::Communicator( SPtr<DataBase> dataBase )
     : myAllocator ( dataBase->myAllocator )
 {
@@ -51,84 +54,42 @@ void Communicator::initialize(GksMeshAdapter & adapter, uint level, uint directi
     this->opposingRank = adapter.communicationProcesses[direction];
 }
 
-void Communicator::exchangeData( SPtr<DataBase> dataBase )
+void Communicator::sendData( SPtr<DataBase> dataBase, int tag )
 {
-    int rank;
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
-    MPI_Wait( &this->sendBufferIsReady, MPI_STATUSES_IGNORE );
-
 #ifdef USE_CUDA_AWARE_MPI
 
     this->copyFromMeshToSendBuffer( dataBase );
     
-    MPI_Isend( this->sendBuffer, this->numberOfSendNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, 0, MPI_COMM_WORLD, &this->sendBufferIsReady );
-    
-    MPI_Recv ( this->recvBuffer, this->numberOfRecvNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE );
-    
-    this->copyFromRecvBufferToMesh( dataBase );
+    MPI_Isend( this->sendBuffer, this->numberOfSendNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, tag, MPI_COMM_WORLD, &this->sendBufferIsReady );
 
 #else // USE_CUDA_AWARE_MPI
 
     this->copyFromMeshToSendBuffer( dataBase );
-    
-    this->myAllocator->copyBuffersDeviceToHost( shared_from_this() );
-    
-    MPI_Isend( this->sendBufferHost, this->numberOfSendNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, 0, MPI_COMM_WORLD, &this->sendBufferIsReady );
-    
-    MPI_Recv ( this->recvBufferHost, this->numberOfRecvNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE );
-    
-    this->myAllocator->copyBuffersHostToDevice( shared_from_this() );
-    
-    this->copyFromRecvBufferToMesh( dataBase );
-
-#endif // USE_CUDA_AWARE_MPI
-}
-
-void Communicator::sendData( SPtr<DataBase> dataBase )
-{
-#ifdef USE_CUDA_AWARE_MPI
-
-    this->copyFromMeshToSendBuffer( dataBase );
-    
-    MPI_Isend( this->sendBuffer, this->numberOfSendNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, 0, MPI_COMM_WORLD, &this->sendBufferIsReady );
-
-#else // USE_CUDA_AWARE_MPI
-
-    this->copyFromMeshToSendBuffer( dataBase );
-    
-    //CudaUtility::synchronizeCudaStream( CudaUtility::communicationStream );
 
     this->myAllocator->copyBuffersDeviceToHost( shared_from_this() );
-
-    //CudaUtility::synchronizeCudaStream( CudaUtility::copyDeviceToHostStream );
     
     CudaUtility::synchronizeCudaStream( CudaUtility::communicationStream );
 
-    MPI_Isend( this->sendBufferHost, this->numberOfSendNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, 0, MPI_COMM_WORLD, &this->sendBufferIsReady );
+    MPI_Isend( this->sendBufferHost, this->numberOfSendNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, tag, MPI_COMM_WORLD, &this->sendBufferIsReady );
 
 #endif // USE_CUDA_AWARE_MPI
 }
 
-void Communicator::recvData( SPtr<DataBase> dataBase )
+void Communicator::recvData( SPtr<DataBase> dataBase, int tag )
 {
 #ifdef USE_CUDA_AWARE_MPI
     
-    MPI_Recv ( this->recvBuffer, this->numberOfRecvNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE );
+    MPI_Recv ( this->recvBuffer, this->numberOfRecvNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, tag, MPI_COMM_WORLD, MPI_STATUSES_IGNORE );
     
     this->copyFromRecvBufferToMesh( dataBase );
 
 #else // USE_CUDA_AWARE_MPI
     
-    MPI_Recv ( this->recvBufferHost, this->numberOfRecvNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE );
+    MPI_Recv ( this->recvBufferHost, this->numberOfRecvNodes * LENGTH_CELL_DATA, MPI_TYPE_GPU, this->opposingRank, tag, MPI_COMM_WORLD, MPI_STATUSES_IGNORE );
     
     this->myAllocator->copyBuffersHostToDevice( shared_from_this() );
-    
-    //CudaUtility::synchronizeCudaStream( CudaUtility::copyHostToDeviceStream );
 
     this->copyFromRecvBufferToMesh( dataBase );
-    
-    //CudaUtility::synchronizeCudaStream( CudaUtility::communicationStream );
 
 #endif // USE_CUDA_AWARE_MPI
 }
diff --git a/src/GksGpu/Communication/Communicator.cu b/src/GksGpu/Communication/Communicator.cu
index cd285cb7b..bf2ca31af 100644
--- a/src/GksGpu/Communication/Communicator.cu
+++ b/src/GksGpu/Communication/Communicator.cu
@@ -70,8 +70,6 @@ void Communicator::copyFromMeshToSendBuffer(const SPtr<DataBase> dataBase)
                this->sendIndices,
                this->sendBuffer,
                0 );
-
-    //CudaUtility::synchronizeCudaStream( CudaUtility::communicationStream );
 }
 
 void Communicator::copyFromRecvBufferToMesh(const SPtr<DataBase> dataBase)
@@ -86,8 +84,6 @@ void Communicator::copyFromRecvBufferToMesh(const SPtr<DataBase> dataBase)
                this->recvIndices,
                this->recvBuffer,
                0 );
-
-    //CudaUtility::synchronizeCudaStream( CudaUtility::communicationStream );
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/GksGpu/Communication/Communicator.h b/src/GksGpu/Communication/Communicator.h
index fb11607cc..7434f121f 100644
--- a/src/GksGpu/Communication/Communicator.h
+++ b/src/GksGpu/Communication/Communicator.h
@@ -29,28 +29,29 @@ struct VF_PUBLIC Communicator : public std::enable_shared_from_this<Communicator
     real* sendBuffer; // device
     real* recvBuffer; // device
 
-    real* sendBufferHost;
-    real* recvBufferHost;
+    real* sendBufferHost; // pinned memory
+    real* recvBufferHost; // pinned memory
 
     uint rank;
     uint opposingRank;
 
     MPI_Request sendBufferIsReady;
 
+    static int tagSendPositive;
+    static int tagSendNegative;
+
     //////////////////////////////////////////////////////////////////////////
 
     Communicator( SPtr<DataBase> dataBase );
 
     void initialize( GksMeshAdapter& adapter, uint level, uint direction );
 
-    void exchangeData( SPtr<DataBase> dataBase );
-
     void copyFromMeshToSendBuffer( SPtr<DataBase> dataBase );
 
     void copyFromRecvBufferToMesh( SPtr<DataBase> dataBase );
 
-    void sendData( SPtr<DataBase> dataBase );
-    void recvData( SPtr<DataBase> dataBase );
+    void sendData( SPtr<DataBase> dataBase, int tag );
+    void recvData( SPtr<DataBase> dataBase, int tag );
 };
 
 #endif
diff --git a/src/GksGpu/CudaUtility/CudaUtility.cpp b/src/GksGpu/CudaUtility/CudaUtility.cpp
index e86156549..d77c928a8 100644
--- a/src/GksGpu/CudaUtility/CudaUtility.cpp
+++ b/src/GksGpu/CudaUtility/CudaUtility.cpp
@@ -10,8 +10,6 @@
 
 cudaStream_t CudaUtility::computeStream = nullptr;
 cudaStream_t CudaUtility::communicationStream = nullptr;
-cudaStream_t CudaUtility::copyDeviceToHostStream = nullptr;
-cudaStream_t CudaUtility::copyHostToDeviceStream = nullptr;
 
 CudaUtility::CudaGrid::CudaGrid( uint numberOfEntities, uint threadsPerBlock, cudaStream_t stream )
 {
@@ -62,11 +60,12 @@ void CudaUtility::setCudaDevice(int device)
     // slide 5
     int priority_high, priority_low;
     cudaDeviceGetStreamPriorityRange(&priority_low , &priority_high ) ;
+
+    // the flag needs to be cudaStreamDefault to ensure synchronization with default stream
+    //cudaStreamCreateWithPriority (&communicationStream, cudaStreamDefault, priority_high );
+    //cudaStreamCreateWithPriority (&computeStream      , cudaStreamDefault, priority_low  );
     cudaStreamCreateWithPriority (&communicationStream, cudaStreamNonBlocking, priority_high );
     cudaStreamCreateWithPriority (&computeStream      , cudaStreamNonBlocking, priority_low  );
-
-    cudaStreamCreate(&copyDeviceToHostStream);
-    cudaStreamCreate(&copyHostToDeviceStream);
 }
 
 int CudaUtility::getCudaDevice()
diff --git a/src/GksGpu/CudaUtility/CudaUtility.h b/src/GksGpu/CudaUtility/CudaUtility.h
index 9fac2a40a..716688d79 100644
--- a/src/GksGpu/CudaUtility/CudaUtility.h
+++ b/src/GksGpu/CudaUtility/CudaUtility.h
@@ -26,8 +26,6 @@ public:
 
     static cudaStream_t computeStream;
     static cudaStream_t communicationStream;
-    static cudaStream_t copyDeviceToHostStream;
-    static cudaStream_t copyHostToDeviceStream;
 
     static void printCudaMemoryUsage();
 
diff --git a/src/GksGpu/DataBase/DataBaseAllocatorGPU.cpp b/src/GksGpu/DataBase/DataBaseAllocatorGPU.cpp
index 8001fc1c6..38287f412 100644
--- a/src/GksGpu/DataBase/DataBaseAllocatorGPU.cpp
+++ b/src/GksGpu/DataBase/DataBaseAllocatorGPU.cpp
@@ -285,18 +285,14 @@ void DataBaseAllocatorGPU::copyDataDeviceToDevice(SPtr<Communicator> dst, SPtr<C
 
 void DataBaseAllocatorGPU::copyBuffersDeviceToHost(SPtr<Communicator> communicator)
 {
-    //checkCudaErrors( cudaMemcpyAsync ( communicator->sendBufferHost, communicator->sendBuffer, LENGTH_CELL_DATA * sizeof(real) * communicator->numberOfSendNodes, cudaMemcpyDeviceToHost, CudaUtility::copyDeviceToHostStream ) );
-    checkCudaErrors( cudaMemcpyAsync ( communicator->sendBufferHost, communicator->sendBuffer, LENGTH_CELL_DATA * sizeof(real) * communicator->numberOfSendNodes, cudaMemcpyDeviceToHost, CudaUtility::communicationStream ) );
-
-    //CudaUtility::synchronizeCudaStream( CudaUtility::copyDeviceToHostStream );
+    size_t size = LENGTH_CELL_DATA * sizeof(real) * communicator->numberOfSendNodes;
+    cudaMemcpyAsync ( communicator->sendBufferHost, communicator->sendBuffer, size, cudaMemcpyDeviceToHost, CudaUtility::communicationStream );
 }
 
 void DataBaseAllocatorGPU::copyBuffersHostToDevice(SPtr<Communicator> communicator)
 {
-    //checkCudaErrors( cudaMemcpyAsync ( communicator->recvBuffer, communicator->recvBufferHost, LENGTH_CELL_DATA * sizeof(real) * communicator->numberOfRecvNodes, cudaMemcpyHostToDevice, CudaUtility::copyHostToDeviceStream ) );
-    checkCudaErrors( cudaMemcpyAsync ( communicator->recvBuffer, communicator->recvBufferHost, LENGTH_CELL_DATA * sizeof(real) * communicator->numberOfRecvNodes, cudaMemcpyHostToDevice, CudaUtility::communicationStream ) );
-
-    //CudaUtility::synchronizeCudaStream( CudaUtility::copyHostToDeviceStream );
+    size_t size = LENGTH_CELL_DATA * sizeof(real) * communicator->numberOfRecvNodes;
+    cudaMemcpyAsync ( communicator->recvBuffer, communicator->recvBufferHost, size, cudaMemcpyHostToDevice, CudaUtility::communicationStream );
 }
 
 std::string DataBaseAllocatorGPU::getDeviceType()
diff --git a/src/GksGpu/TimeStepping/NestedTimeStep.cpp b/src/GksGpu/TimeStepping/NestedTimeStep.cpp
index ba4676629..6f35699d9 100644
--- a/src/GksGpu/TimeStepping/NestedTimeStep.cpp
+++ b/src/GksGpu/TimeStepping/NestedTimeStep.cpp
@@ -23,11 +23,6 @@ void TimeStepping::nestedTimeStep( SPtr<DataBase> dataBase,
 
     //////////////////////////////////////////////////////////////////////////
 
-    //set different viscosity on specific levels
-    //if( level >= 3 ) parameters.mu = 1.0e-3;
-
-    //////////////////////////////////////////////////////////////////////////
-
     if( level != dataBase->numberOfLevels - 1 )
     {
         Interface::runFineToCoarse( dataBase, level );
@@ -42,6 +37,10 @@ void TimeStepping::nestedTimeStep( SPtr<DataBase> dataBase,
 
     //////////////////////////////////////////////////////////////////////////
 
+    CudaUtility::synchronizeCudaDevice();
+
+    //////////////////////////////////////////////////////////////////////////
+
     FluxComputation::run( dataBase, parameters, level );
     
     //////////////////////////////////////////////////////////////////////////
@@ -55,52 +54,31 @@ void TimeStepping::nestedTimeStep( SPtr<DataBase> dataBase,
         //
         //////////////////////////////////////////////////////////////////////////
         //////////////////////////////////////////////////////////////////////////
-        //for( uint direction = 0; direction < 6; direction++ )
-        //{
-        //    if( dataBase->communicators[level][direction] != nullptr )
-        //    {
-        //        dataBase->communicators[level][direction]->exchangeData(dataBase);
-        //    }
-        //}
-        //////////////////////////////////////////////////////////////////////////
-        //for( uint direction = 0; direction < 6; direction++ )
-        //{
-        //    if( dataBase->communicators[level][direction] != nullptr )
-        //    {
-        //        dataBase->communicators[level][direction]->sendData(dataBase);
-        //    }
-        //}
-        //for( uint direction = 0; direction < 6; direction++ )
-        //{
-        //    if( dataBase->communicators[level][direction] != nullptr )
-        //    {
-        //        dataBase->communicators[level][direction]->recvData(dataBase);
-        //    }
-        //}
+
         //////////////////////////////////////////////////////////////////////////
         // X
         //////////////////////////////////////////////////////////////////////////
-        if( dataBase->communicators[level][0] != nullptr ) dataBase->communicators[level][0]->sendData(dataBase);
-        if( dataBase->communicators[level][1] != nullptr ) dataBase->communicators[level][1]->sendData(dataBase);
+        if( dataBase->communicators[level][0] != nullptr ) dataBase->communicators[level][0]->sendData(dataBase, Communicator::tagSendNegative);
+        if( dataBase->communicators[level][1] != nullptr ) dataBase->communicators[level][1]->sendData(dataBase, Communicator::tagSendPositive);
 
-        if( dataBase->communicators[level][0] != nullptr ) dataBase->communicators[level][0]->recvData(dataBase);
-        if( dataBase->communicators[level][1] != nullptr ) dataBase->communicators[level][1]->recvData(dataBase);
+        if( dataBase->communicators[level][0] != nullptr ) dataBase->communicators[level][0]->recvData(dataBase, Communicator::tagSendPositive);
+        if( dataBase->communicators[level][1] != nullptr ) dataBase->communicators[level][1]->recvData(dataBase, Communicator::tagSendNegative);
         //////////////////////////////////////////////////////////////////////////
         // Y
         //////////////////////////////////////////////////////////////////////////
-        if( dataBase->communicators[level][2] != nullptr ) dataBase->communicators[level][2]->sendData(dataBase);
-        if( dataBase->communicators[level][3] != nullptr ) dataBase->communicators[level][3]->sendData(dataBase);
+        if( dataBase->communicators[level][2] != nullptr ) dataBase->communicators[level][2]->sendData(dataBase, Communicator::tagSendNegative);
+        if( dataBase->communicators[level][3] != nullptr ) dataBase->communicators[level][3]->sendData(dataBase, Communicator::tagSendPositive);
         
-        if( dataBase->communicators[level][2] != nullptr ) dataBase->communicators[level][2]->recvData(dataBase);
-        if( dataBase->communicators[level][3] != nullptr ) dataBase->communicators[level][3]->recvData(dataBase);
+        if( dataBase->communicators[level][2] != nullptr ) dataBase->communicators[level][2]->recvData(dataBase, Communicator::tagSendPositive);
+        if( dataBase->communicators[level][3] != nullptr ) dataBase->communicators[level][3]->recvData(dataBase, Communicator::tagSendNegative);
         //////////////////////////////////////////////////////////////////////////
         // Z
         //////////////////////////////////////////////////////////////////////////
-        if( dataBase->communicators[level][4] != nullptr ) dataBase->communicators[level][4]->sendData(dataBase);
-        if( dataBase->communicators[level][5] != nullptr ) dataBase->communicators[level][5]->sendData(dataBase);
+        if( dataBase->communicators[level][4] != nullptr ) dataBase->communicators[level][4]->sendData(dataBase, Communicator::tagSendNegative);
+        if( dataBase->communicators[level][5] != nullptr ) dataBase->communicators[level][5]->sendData(dataBase, Communicator::tagSendPositive);
         
-        if( dataBase->communicators[level][4] != nullptr ) dataBase->communicators[level][4]->recvData(dataBase);
-        if( dataBase->communicators[level][5] != nullptr ) dataBase->communicators[level][5]->recvData(dataBase);
+        if( dataBase->communicators[level][4] != nullptr ) dataBase->communicators[level][4]->recvData(dataBase, Communicator::tagSendPositive);
+        if( dataBase->communicators[level][5] != nullptr ) dataBase->communicators[level][5]->recvData(dataBase, Communicator::tagSendNegative);
     }
 
     //////////////////////////////////////////////////////////////////////////
diff --git a/targets/apps/GKS/MultiGPU/MultiGPU.cpp b/targets/apps/GKS/MultiGPU/MultiGPU.cpp
index d54356812..38fcdbad5 100644
--- a/targets/apps/GKS/MultiGPU/MultiGPU.cpp
+++ b/targets/apps/GKS/MultiGPU/MultiGPU.cpp
@@ -261,7 +261,7 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    const uint numberOfIterations = 10;
+    const uint numberOfIterations = 1000;
 
     CupsAnalyzer cupsAnalyzer( dataBase, true, 30.0, true, numberOfIterations );
 
@@ -324,7 +324,7 @@ int main( int argc, char* argv[])
     //////////////////////////////////////////////////////////////////////////
 
     bool strongScaling = false;
-    uint nx = 64;
+    uint nx = 128;
 
     if( argc > 1 ) path += argv[1]; path += "/";
     if( argc > 2 ) nx = atoi( argv[2] );
@@ -363,7 +363,7 @@ int main( int argc, char* argv[])
     //////////////////////////////////////////////////////////////////////////
 
     if( sizeof(real) == 4 )
-        *logging::out << logging::Logger::INFO_HIGH << "Using Single Precison\n";
+        *logging::out << logging::Logger::INFO_HIGH << "Using Single Precision\n";
     else
         *logging::out << logging::Logger::INFO_HIGH << "Using Double Precision\n";
 
diff --git a/targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp b/targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp
index 0288c6e50..6d5cdd57f 100644
--- a/targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp
+++ b/targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp
@@ -70,29 +70,50 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
     
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    int sideLength, rankX, rankY, rankZ;
+    int sideLengthX, sideLengthY, sideLengthZ, rankX, rankY, rankZ;
 
-    if( decompositionDimension == 2 )
+    if( decompositionDimension == 1 )
     {
-        if      (mpiWorldSize == 1)  sideLength = 1;
-        else if (mpiWorldSize == 4)  sideLength = 2;
-        else if (mpiWorldSize == 16) sideLength = 4;
-
-        rankX = rank % sideLength;
-        rankY = rank / sideLength;
+        if      (mpiWorldSize == 1 ) { sideLengthX = 1 ; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 2 ) { sideLengthX = 2 ; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 4 ) { sideLengthX = 4 ; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 8 ) { sideLengthX = 8 ; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 16) { sideLengthX = 16; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 32) { sideLengthX = 32; sideLengthY = 1; sideLengthZ = 1; }
+
+        rankX = rank;
+        rankY = 0;
+        rankZ = 0;
+    }
+    else if( decompositionDimension == 2 )
+    {
+        if      (mpiWorldSize == 1 ) { sideLengthX = 1; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 2 ) { sideLengthX = 2; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 4 ) { sideLengthX = 2; sideLengthY = 2; sideLengthZ = 1; }
+        else if (mpiWorldSize == 8 ) { sideLengthX = 4; sideLengthY = 2; sideLengthZ = 1; }
+        else if (mpiWorldSize == 16) { sideLengthX = 4; sideLengthY = 4; sideLengthZ = 1; }
+        else if (mpiWorldSize == 32) { sideLengthX = 8; sideLengthY = 4; sideLengthZ = 1; }
+
+        rankX = rank % sideLengthX;
+        rankY = rank / sideLengthX;
         rankZ = 0;
     }
     else if( decompositionDimension == 3 )
     {
-        if      (mpiWorldSize == 1)  sideLength = 1;
-        else if (mpiWorldSize == 8)  sideLength = 2;
-
-        rankX =   rank % sideLength;
-        rankY = ( rank % ( sideLength * sideLength ) ) / sideLength;
-        rankZ =   rank                                 / ( sideLength * sideLength );
+        if      (mpiWorldSize == 1 ) { sideLengthX = 1; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 2 ) { sideLengthX = 2; sideLengthY = 1; sideLengthZ = 1; }
+        else if (mpiWorldSize == 4 ) { sideLengthX = 2; sideLengthY = 2; sideLengthZ = 1; }
+        else if (mpiWorldSize == 8 ) { sideLengthX = 2; sideLengthY = 2; sideLengthZ = 2; }
+        else if (mpiWorldSize == 16) { sideLengthX = 4; sideLengthY = 2; sideLengthZ = 2; }
+        else if (mpiWorldSize == 32) { sideLengthX = 4; sideLengthY = 4; sideLengthZ = 2; }
+
+        rankX =   rank %   sideLengthX;
+        rankY = ( rank % ( sideLengthX * sideLengthY ) ) /   sideLengthX;
+        rankZ =   rank                                   / ( sideLengthY * sideLengthX );
     }
 
-    *logging::out << logging::Logger::INFO_HIGH << sideLength << " " << rankX << " " << rankY << " " << rankZ << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "SideLength = " << sideLengthX << " " << sideLengthY << " " << sideLengthZ << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "rank       = " << rankX << " " << rankY << " " << rankZ << "\n";
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -106,16 +127,20 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
 
     if( strongScaling )
     {
-        if( decompositionDimension == 2 )
+        if( decompositionDimension == 1 )
         {
-            LX /= double(sideLength);
-            LY /= double(sideLength);
+            LX /= double(sideLengthX);
+        }
+        else if( decompositionDimension == 2 )
+        {
+            LX /= double(sideLengthX);
+            LY /= double(sideLengthY);
         }
         else if( decompositionDimension == 3 )
         {
-            LX /= double(sideLength);
-            LY /= double(sideLength);
-            LZ /= double(sideLength);
+            LX /= double(sideLengthX);
+            LY /= double(sideLengthY);
+            LZ /= double(sideLengthZ);
         }
     }
 
@@ -127,7 +152,7 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
     parameters.Pr = 1;
     parameters.mu = 0.01;
 
-    parameters.force.x = 0.1;
+    parameters.force.x = 0;
     parameters.force.y = 0;
     parameters.force.z = 0;
 
@@ -146,98 +171,60 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    if( mpiWorldSize > 1 )
-    {
-        //if( decompositionDimension == 2 )
-        //{
-            gridBuilder->addCoarseGrid(  rankX*LX    - 0.5*L - 5.0*dx,      rankY*LY    - 0.5*L - 5.0*dx,      rankZ*LZ    - 0.5*L - 5.0*dx,
-                                        (rankX*LX+1) - 0.5*L + 5.0*dx,     (rankY*LY+1) - 0.5*L + 5.0*dx,     (rankZ*LZ+1) - 0.5*L + 5.0*dx, dx);
-        //}
-
-        //if( decompositionDimension == 3 )
-        //{
-        //    gridBuilder->addCoarseGrid( rankX*LX - 0.5*L - 5.0*dx,     rankY*LY - 0.5*L - 5.0*dx,     rankZ*LZ - 0.5*L - 5.0*dx,  
-        //                                rankX*LX + 0.5*L + 5.0*dx,     rankY*LY + 0.5*L + 5.0*dx,     rankZ*LZ + 0.5*L + 5.0*dx, dx);
-        //}
-
-        gridBuilder->setSubDomainBox( std::make_shared<BoundingBox>( rankX*LX - 0.5*L, (rankX+1)*LX - 0.5*L, 
-                                                                     rankY*LY - 0.5*L, (rankY+1)*LY - 0.5*L,
-                                                                     rankZ*LZ - 0.5*L, (rankZ+1)*LZ - 0.5*L  ) );
-    }
-    else
-    {
-        gridBuilder->addCoarseGrid( -0.5*L, -0.5*L, -0.5*L,  
-                                     0.5*L,  0.5*L,  0.5*L, dx);
-    }
-
-    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-    gridBuilder->setPeriodicBoundaryCondition(true, true, true);
+    real xOverlap = ( sideLengthX == 1 ) ? 0.0 : 5.0*dx;
+    real yOverlap = ( sideLengthY == 1 ) ? 0.0 : 5.0*dx;
+    real zOverlap = ( sideLengthZ == 1 ) ? 0.0 : 5.0*dx;
 
-    gridBuilder->buildGrids(GKS, false);
+    gridBuilder->addCoarseGrid(  rankX*LX    - 0.5*L - xOverlap,      rankY*LY    - 0.5*L - yOverlap,      rankZ*LZ    - 0.5*L - zOverlap,
+                                (rankX*LX+1) - 0.5*L + xOverlap,     (rankY*LY+1) - 0.5*L + yOverlap,     (rankZ*LZ+1) - 0.5*L + zOverlap, dx);
 
-    MPI_Barrier(MPI_COMM_WORLD);
+    gridBuilder->setSubDomainBox( std::make_shared<BoundingBox>( rankX*LX - 0.5*L, (rankX+1)*LX - 0.5*L, 
+                                                                 rankY*LY - 0.5*L, (rankY+1)*LY - 0.5*L,
+                                                                 rankZ*LZ - 0.5*L, (rankZ+1)*LZ - 0.5*L  ) );
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    if( decompositionDimension == 1 && mpiWorldSize > 1 )
-    {
-        gridBuilder->findCommunicationIndices( CommunicationDirections::PX, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::PX, (rank + 1 + mpiWorldSize) % mpiWorldSize );
+    gridBuilder->setPeriodicBoundaryCondition(sideLengthX == 1, sideLengthY == 1, sideLengthZ == 1);
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::MX, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::MX, (rank - 1 + mpiWorldSize) % mpiWorldSize );
-    }
-    else if ( decompositionDimension == 2 && mpiWorldSize > 1 )
-    {
-        int rankPX = ( (rankX + 1 + sideLength) % sideLength ) +    rankY                                  * sideLength;
-        int rankMX = ( (rankX - 1 + sideLength) % sideLength ) +    rankY                                  * sideLength;
-        int rankPY =    rankX                                  + ( (rankY + 1 + sideLength) % sideLength ) * sideLength;
-        int rankMY =    rankX                                  + ( (rankY - 1 + sideLength) % sideLength ) * sideLength;
+    *logging::out << logging::Logger::INFO_HIGH << "periodicity = " << (sideLengthX == 1) << " " << (sideLengthY == 1) << " " << (sideLengthZ == 1) << "\n";
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::PX, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::PX, rankPX);
+    gridBuilder->buildGrids(GKS, false);
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::MX, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::MX, rankMX);
+    MPI_Barrier(MPI_COMM_WORLD);
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::PY, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::PY, rankPY);
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::MY, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::MY, rankMY);
-    }
-    else if ( decompositionDimension == 3 && mpiWorldSize > 1 )
+    if( mpiWorldSize > 1 )
     {
-        int rankPX = ( (rankX + 1 + sideLength) % sideLength ) +    rankY                                  * sideLength +    rankZ                                  * sideLength * sideLength;
-        int rankMX = ( (rankX - 1 + sideLength) % sideLength ) +    rankY                                  * sideLength +    rankZ                                  * sideLength * sideLength;
-        int rankPY =    rankX                                  + ( (rankY + 1 + sideLength) % sideLength ) * sideLength +    rankZ                                  * sideLength * sideLength;
-        int rankMY =    rankX                                  + ( (rankY - 1 + sideLength) % sideLength ) * sideLength +    rankZ                                  * sideLength * sideLength;
-        int rankPZ =    rankX                                  +    rankY                                  * sideLength + ( (rankZ + 1 + sideLength) % sideLength ) * sideLength * sideLength;
-        int rankMZ =    rankX                                  +    rankY                                  * sideLength + ( (rankZ - 1 + sideLength) % sideLength ) * sideLength * sideLength;
+        int rankPX = ( (rankX + 1 + sideLengthX) % sideLengthX ) +    rankY                                    * sideLengthX +    rankZ                                    * sideLengthX * sideLengthY;
+        int rankMX = ( (rankX - 1 + sideLengthX) % sideLengthX ) +    rankY                                    * sideLengthX +    rankZ                                    * sideLengthX * sideLengthY;
+        int rankPY =    rankX                                    + ( (rankY + 1 + sideLengthY) % sideLengthY ) * sideLengthX +    rankZ                                    * sideLengthX * sideLengthY;
+        int rankMY =    rankX                                    + ( (rankY - 1 + sideLengthY) % sideLengthY ) * sideLengthX +    rankZ                                    * sideLengthX * sideLengthY;
+        int rankPZ =    rankX                                    +    rankY                                    * sideLengthX + ( (rankZ + 1 + sideLengthZ) % sideLengthZ ) * sideLengthX * sideLengthY;
+        int rankMZ =    rankX                                    +    rankY                                    * sideLengthX + ( (rankZ - 1 + sideLengthZ) % sideLengthZ ) * sideLengthX * sideLengthY;
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::PX, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::PX, rankPX);
+        if( sideLengthX > 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::PX, GKS );
+        if( sideLengthX > 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::PX, rankPX);
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::MX, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::MX, rankMX);
+        if( sideLengthX > 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::MX, GKS );
+        if( sideLengthX > 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::MX, rankMX);
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::PY, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::PY, rankPY);
+        if( sideLengthY > 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::PY, GKS );
+        if( sideLengthY > 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::PY, rankPY);
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::MY, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::MY, rankMY);
+        if( sideLengthY > 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::MY, GKS );
+        if( sideLengthY > 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::MY, rankMY);
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::PZ, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::PZ, rankPZ);
+        if( sideLengthZ > 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::PZ, GKS );
+        if( sideLengthZ > 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::PZ, rankPZ);
 
-        gridBuilder->findCommunicationIndices( CommunicationDirections::MZ, GKS );
-        gridBuilder->setCommunicationProcess ( CommunicationDirections::MZ, rankMZ);
+        if( sideLengthZ > 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::MZ, GKS );
+        if( sideLengthZ > 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::MZ, rankMZ);
 
-        *logging::out << logging::Logger::INFO_HIGH << rankPX << " " << rankMX << " " << rankPY << " " << rankMY << "\n";
+        *logging::out << logging::Logger::INFO_HIGH << "neighborRanks = " << rankPX << " " << rankMX << " " << rankPY << " " << rankMY << " " << rankPY << " " << rankMY << "\n";
     }
 
-    gridBuilder->writeGridsToVtk(path + "/Grid_rank_" + std::to_string(rank) + "_lev_");
+    //gridBuilder->writeGridsToVtk(path + "/Grid_rank_" + std::to_string(rank) + "_lev_");
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -245,9 +232,7 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
 
     meshAdapter.inputGrid();
 
-    meshAdapter.getCommunicationIndices();
-
-    meshAdapter.findPeriodicBoundaryNeighbors();
+    if( sideLengthX == 1 || sideLengthY == 1 || sideLengthZ == 1 ) meshAdapter.findPeriodicBoundaryNeighbors();
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -260,58 +245,49 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
 
     SPtr<BoundaryCondition> bcMX = std::make_shared<Periodic>( dataBase );
     SPtr<BoundaryCondition> bcPX = std::make_shared<Periodic>( dataBase );
-    //SPtr<BoundaryCondition> bcMX = std::make_shared<AdiabaticWall>( dataBase, Vec3(0.0, 0.0, 0.0), false );
-    //SPtr<BoundaryCondition> bcPX = std::make_shared<AdiabaticWall>( dataBase, Vec3(0.0, 0.1, 0.0), false );
 
-    bcMX->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.x < -0.5*L; } );
-    bcPX->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.x >  0.5*L; } );
+    if( sideLengthX == 1 ) bcMX->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.x < -0.5*L; } );
+    if( sideLengthX == 1 ) bcPX->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.x >  0.5*L; } );
 
     //////////////////////////////////////////////////////////////////////////
 
-    //SPtr<BoundaryCondition> bcMY = std::make_shared<Periodic>( dataBase );
-    //SPtr<BoundaryCondition> bcPY = std::make_shared<Periodic>( dataBase );
     SPtr<BoundaryCondition> bcMY = std::make_shared<Periodic>( dataBase );
     SPtr<BoundaryCondition> bcPY = std::make_shared<Periodic>( dataBase );
 
-    bcMY->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.y < -0.5*L; } );
-    bcPY->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.y >  0.5*L; } );
+    if( sideLengthY == 1 ) bcMY->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.y < -0.5*L; } );
+    if( sideLengthY == 1 ) bcPY->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.y >  0.5*L; } );
 
     //////////////////////////////////////////////////////////////////////////
     
-    //SPtr<BoundaryCondition> bcMZ = std::make_shared<Periodic>( dataBase );
-    //SPtr<BoundaryCondition> bcPZ = std::make_shared<Periodic>( dataBase );
     SPtr<BoundaryCondition> bcMZ = std::make_shared<Periodic>( dataBase );
     SPtr<BoundaryCondition> bcPZ = std::make_shared<Periodic>( dataBase );
     
-    bcMZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z < -0.5*L; } );
-    bcPZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z >  0.5*L; } );
+    if( sideLengthZ == 1 ) bcMZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z < -0.5*L; } );
+    if( sideLengthZ == 1 ) bcPZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z >  0.5*L; } );
 
     //////////////////////////////////////////////////////////////////////////
 
-    if( mpiWorldSize == 1 )
-    {
-        dataBase->boundaryConditions.push_back( bcMX );
-        dataBase->boundaryConditions.push_back( bcPX );
+    if( sideLengthX == 1 ) dataBase->boundaryConditions.push_back( bcMX );
+    if( sideLengthX == 1 ) dataBase->boundaryConditions.push_back( bcPX );
     
-        dataBase->boundaryConditions.push_back( bcMY );
-        dataBase->boundaryConditions.push_back( bcPY );
-    }
-    if( decompositionDimension == 2 || mpiWorldSize == 1 )
-    {
-        dataBase->boundaryConditions.push_back( bcMZ );
-        dataBase->boundaryConditions.push_back( bcPZ );
-    }
+    if( sideLengthY == 1 ) dataBase->boundaryConditions.push_back( bcMY );
+    if( sideLengthY == 1 ) dataBase->boundaryConditions.push_back( bcPY );
+
+    if( sideLengthZ == 1 ) dataBase->boundaryConditions.push_back( bcMZ );
+    if( sideLengthZ == 1 ) dataBase->boundaryConditions.push_back( bcPZ );
 
     //////////////////////////////////////////////////////////////////////////
 
-    *logging::out << logging::Logger::INFO_HIGH << "bcMX ==> " << bcMX->numberOfCellsPerLevel[0] << "\n";
-    *logging::out << logging::Logger::INFO_HIGH << "bcPX ==> " << bcPX->numberOfCellsPerLevel[0] << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "NumberOfBoundaryConditions = " << (int)dataBase->boundaryConditions.size() << "\n";
+
+    if( sideLengthX == 1 ) *logging::out << logging::Logger::INFO_HIGH << "bcMX ==> " << bcMX->numberOfCellsPerLevel[0] << "\n";
+    if( sideLengthX == 1 ) *logging::out << logging::Logger::INFO_HIGH << "bcPX ==> " << bcPX->numberOfCellsPerLevel[0] << "\n";
 
-    *logging::out << logging::Logger::INFO_HIGH << "bcMY ==> " << bcMY->numberOfCellsPerLevel[0] << "\n";
-    *logging::out << logging::Logger::INFO_HIGH << "bcPY ==> " << bcPY->numberOfCellsPerLevel[0] << "\n";
+    if( sideLengthY == 1 ) *logging::out << logging::Logger::INFO_HIGH << "bcMY ==> " << bcMY->numberOfCellsPerLevel[0] << "\n";
+    if( sideLengthY == 1 ) *logging::out << logging::Logger::INFO_HIGH << "bcPY ==> " << bcPY->numberOfCellsPerLevel[0] << "\n";
 
-    *logging::out << logging::Logger::INFO_HIGH << "bcMZ ==> " << bcMZ->numberOfCellsPerLevel[0] << "\n";
-    *logging::out << logging::Logger::INFO_HIGH << "bcPZ ==> " << bcPZ->numberOfCellsPerLevel[0] << "\n";
+    if( sideLengthZ == 1 ) *logging::out << logging::Logger::INFO_HIGH << "bcMZ ==> " << bcMZ->numberOfCellsPerLevel[0] << "\n";
+    if( sideLengthZ == 1 ) *logging::out << logging::Logger::INFO_HIGH << "bcPZ ==> " << bcPZ->numberOfCellsPerLevel[0] << "\n";
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -321,18 +297,38 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
     dataBase->setMesh( meshAdapter );
 
     dataBase->setCommunicators( meshAdapter );
-    
-    //*logging::out << logging::Logger::WARNING << int(dataBase->communicators[0].size()) << "\n";
-    //*logging::out << logging::Logger::WARNING << int(dataBase->communicators[0][0].get()) << "\n";
-    //*logging::out << logging::Logger::WARNING << int(dataBase->communicators[0][1].get()) << "\n";
 
     CudaUtility::printCudaMemoryUsage();
 
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
     Initializer::interpret(dataBase, [&] ( Vec3 cellCenter ) -> ConservedVariables
     {
-        return toConservedVariables( PrimitiveVariables( 1.0, 1.0, 0.0, 0.0, parameters.lambdaRef ), parameters.K );
+        real U = 0.1;
+
+        real ULocal =   /*0.1*/ + U * sin( 2.0 * M_PI * cellCenter.x ) * cos( 2.0 * M_PI * cellCenter.y ) * cos( 2.0 * M_PI * cellCenter.z );
+        real VLocal =   /*0.1*/ - U * cos( 2.0 * M_PI * cellCenter.x ) * sin( 2.0 * M_PI * cellCenter.y ) * cos( 2.0 * M_PI * cellCenter.z );
+        real WLocal =   0.1;
+
+        real rho = 1.0;
+
+        real p0 = 0.5 * rho / parameters.lambdaRef;
+
+        real pLocal = p0 + rho * U * U / 16.0 * ( cos( 2.0 * M_PI * 2.0 * cellCenter.x ) + cos( 2.0 * M_PI * 2.0 * cellCenter.y ) ) * ( 2.0 + cos( 2.0 * M_PI * 2.0 * cellCenter.z ) );
+
+        real rhoLocal = 2.0 * pLocal * parameters.lambdaRef;
+
+        //ULocal = cellCenter.x;
+        //VLocal = cellCenter.y;
+        //WLocal = cellCenter.z;
+
+        //rhoLocal = rank + 1;
+
+        return toConservedVariables( PrimitiveVariables( rhoLocal, ULocal, VLocal, WLocal, parameters.lambdaRef ), parameters.K );
     });
 
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
     dataBase->copyDataHostToDevice();
 
     for( auto bc : dataBase->boundaryConditions ) 
@@ -346,45 +342,62 @@ void performanceTest( std::string path, std::string simulationName, uint decompo
     if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_0", mpiWorldSize );
 
     writeVtkXML( dataBase, parameters, 0, path + simulationName + "_0" + "_rank_" + std::to_string(rank) );
-
+    
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    CupsAnalyzer cupsAnalyzer( dataBase, true, 30.0 );
+    const uint numberOfIterations = 1000;
 
-    //////////////////////////////////////////////////////////////////////////
+    CupsAnalyzer cupsAnalyzer( dataBase, true, 30.0, true, numberOfIterations );
+
+    MPI_Barrier(MPI_COMM_WORLD);
 
     cupsAnalyzer.start();
 
-    for( uint iter = 1; iter <= 10000; iter++ )
+    for( uint iter = 1; iter <= numberOfIterations; iter++ )
     {
         TimeStepping::nestedTimeStep(dataBase, parameters, 0);
+    }
 
-        cupsAnalyzer.run( iter );
+    cupsAnalyzer.run( numberOfIterations, parameters.dt );
 
-        if( iter % 10000 == 0 )
-        {
-            dataBase->copyDataDeviceToHost();
+    //////////////////////////////////////////////////////////////////////////
 
-            if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_" + std::to_string( iter ), mpiWorldSize );
+    dataBase->copyDataDeviceToHost();
 
-            writeVtkXML( dataBase, parameters, 0, path + simulationName + "_" + std::to_string( iter ) + "_rank_" + std::to_string(rank) );
-        }
-    }
+    if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_final", mpiWorldSize );
 
+    writeVtkXML( dataBase, parameters, 0, path + simulationName + "_final_rank_" + std::to_string(rank) );
+    
     //////////////////////////////////////////////////////////////////////////
 
-    dataBase->copyDataDeviceToHost();
+    int crashCellIndex = dataBase->getCrashCellIndex();
+    if( crashCellIndex >= 0 )
+    {
+        *logging::out << logging::Logger::LOGGER_ERROR << "=================================================\n";
+        *logging::out << logging::Logger::LOGGER_ERROR << "=================================================\n";
+        *logging::out << logging::Logger::LOGGER_ERROR << "============= Simulation Crashed!!! =============\n";
+        *logging::out << logging::Logger::LOGGER_ERROR << "=================================================\n";
+        *logging::out << logging::Logger::LOGGER_ERROR << "=================================================\n";
+    }
 }
 
 int main( int argc, char* argv[])
 {
     //////////////////////////////////////////////////////////////////////////
-
+    
+    int rank = 0;
+    int mpiWorldSize = 1;
+#ifdef USE_CUDA_AWARE_MPI
     int rank         = MpiUtility::getMpiRankBeforeInit();
     int mpiWorldSize = MpiUtility::getMpiWorldSizeBeforeInit();
+#else
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiWorldSize);
+#endif
 
     //////////////////////////////////////////////////////////////////////////
 
@@ -395,21 +408,22 @@ int main( int argc, char* argv[])
     std::string path( "out/" );
 #endif
 
-    std::string simulationName ( "MultiGPU_np_" + std::to_string(mpiWorldSize) );
-
     //////////////////////////////////////////////////////////////////////////
 
-    bool strongScaling = false;
+    bool strongScaling = true;
     uint nx = 64;
+    uint decompositionDimension = 3;
 
-    uint decompositionDimension = 2;
-
-    if( argc > 1 ) path += argv[1]; path += "/";
-    if( argc > 2 ) nx = atoi( argv[2] );
+    if( argc > 1 ) nx = atoi( argv[1] );
+    if( argc > 2 ) decompositionDimension = atoi( argv[2] );
     if( argc > 3 ) strongScaling = true;
 
     //////////////////////////////////////////////////////////////////////////
 
+    std::string simulationName ( "MultiGPU_np_" + std::to_string(mpiWorldSize) );
+
+    //////////////////////////////////////////////////////////////////////////
+
     logging::Logger::addStream(&std::cout);
     
     std::ofstream logFile( path + simulationName + "_rank_" + std::to_string(rank) + ".log" );
@@ -434,7 +448,9 @@ int main( int argc, char* argv[])
 
     //////////////////////////////////////////////////////////////////////////
 
+#ifdef USE_CUDA_AWARE_MPI
     MPI_Init(&argc, &argv);
+#endif
     
     //////////////////////////////////////////////////////////////////////////
 
@@ -453,15 +469,15 @@ int main( int argc, char* argv[])
     }
     catch (const std::exception& e)
     {     
-        *logging::out << logging::Logger::ERROR << e.what() << "\n";
+        *logging::out << logging::Logger::LOGGER_ERROR << e.what() << "\n";
     }
     catch (const std::bad_alloc& e)
     {  
-        *logging::out << logging::Logger::ERROR << "Bad Alloc:" << e.what() << "\n";
+        *logging::out << logging::Logger::LOGGER_ERROR << "Bad Alloc:" << e.what() << "\n";
     }
     catch (...)
     {
-        *logging::out << logging::Logger::ERROR << "Unknown exception!\n";
+        *logging::out << logging::Logger::LOGGER_ERROR << "Unknown exception!\n";
     }
 
     //////////////////////////////////////////////////////////////////////////
-- 
GitLab