From 48a296c2511f04a31bd1e285a2e01b71e144598b Mon Sep 17 00:00:00 2001
From: "LEGOLAS\\lenz" <lenz@irmb.tu-bs.de>
Date: Thu, 13 Jun 2019 09:46:08 +0200
Subject: [PATCH] nnew MultiGPU Performance Test target

---
 CMakeLists.txt                                |   1 +
 targets/apps/GKS/Flame7cm/Flame7cm.cpp        |  10 +-
 .../GKS/MultiGPU_2D/3rdPartyLinking.cmake     |  11 +
 targets/apps/GKS/MultiGPU_2D/CMakeLists.txt   |  19 +
 .../apps/GKS/MultiGPU_2D/CMakePackage.cmake   |   9 +
 targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp  | 476 ++++++++++++++++++
 targets/apps/GKS/MultiGPU_2D/package.include  |   0
 targets/apps/GKS/PoolFire/PoolFire.cpp        |   6 +-
 8 files changed, 524 insertions(+), 8 deletions(-)
 create mode 100644 targets/apps/GKS/MultiGPU_2D/3rdPartyLinking.cmake
 create mode 100644 targets/apps/GKS/MultiGPU_2D/CMakeLists.txt
 create mode 100644 targets/apps/GKS/MultiGPU_2D/CMakePackage.cmake
 create mode 100644 targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp
 create mode 100644 targets/apps/GKS/MultiGPU_2D/package.include

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 666d84826..408a8b255 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -172,6 +172,7 @@ IF (HULC.BUILD_VF_GKS)
     add_subdirectory(targets/apps/GKS/Candle)
     
     add_subdirectory(targets/apps/GKS/MultiGPU)
+    add_subdirectory(targets/apps/GKS/MultiGPU_2D)
 ELSE()
   MESSAGE( STATUS "exclude Virtual Fluids GKS." )
 ENDIF()
diff --git a/targets/apps/GKS/Flame7cm/Flame7cm.cpp b/targets/apps/GKS/Flame7cm/Flame7cm.cpp
index 2972fcaf9..fe7116093 100644
--- a/targets/apps/GKS/Flame7cm/Flame7cm.cpp
+++ b/targets/apps/GKS/Flame7cm/Flame7cm.cpp
@@ -187,8 +187,8 @@ void thermalCavity( std::string path, std::string simulationName )
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     
-    SPtr<BoundaryCondition> bcMX = std::make_shared<Open>( dataBase, prim );
-    SPtr<BoundaryCondition> bcPX = std::make_shared<Open>( dataBase, prim );
+    SPtr<BoundaryCondition> bcMX = std::make_shared<Open>( dataBase, prim, 10.0 );
+    SPtr<BoundaryCondition> bcPX = std::make_shared<Open>( dataBase, prim, 10.0 );
     //SPtr<BoundaryCondition> bcMX = std::make_shared<AdiabaticWall>( dataBase, Vec3(0, 0, 0), false );
     //SPtr<BoundaryCondition> bcPX = std::make_shared<AdiabaticWall>( dataBase, Vec3(0, 0, 0), false );
 
@@ -208,8 +208,8 @@ void thermalCavity( std::string path, std::string simulationName )
 
     if( threeDimensional )
     {
-        bcMY = std::make_shared<Open>( dataBase, prim );
-        bcPY = std::make_shared<Open>( dataBase, prim );
+        bcMY = std::make_shared<Open>( dataBase, prim, 10.0 );
+        bcPY = std::make_shared<Open>( dataBase, prim, 10.0 );
 
         bcMY->findBoundaryCells( meshAdapter, false, [&](Vec3 center){ return center.y < -0.5*L; } );
         bcPY->findBoundaryCells( meshAdapter, false, [&](Vec3 center){ return center.y >  0.5*L; } );
@@ -230,7 +230,7 @@ void thermalCavity( std::string path, std::string simulationName )
     //SPtr<BoundaryCondition> bcMZ = std::make_shared<InflowComplete>( dataBase, PrimitiveVariables(rho, 0.0, 0.0, 0.0, prim.lambda, 0.0, 0.0) );
     //SPtr<BoundaryCondition> bcMZ = std::make_shared<Open>( dataBase );
 
-    SPtr<BoundaryCondition> bcPZ = std::make_shared<Open>( dataBase, prim );
+    SPtr<BoundaryCondition> bcPZ = std::make_shared<Open>( dataBase, prim, 10.0 );
     
     bcMZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z < 0.0 /*&& std::sqrt(center.x*center.x + center.y*center.y) >= 0.5*0.071*/; } );
     bcPZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z > H  ; } );
diff --git a/targets/apps/GKS/MultiGPU_2D/3rdPartyLinking.cmake b/targets/apps/GKS/MultiGPU_2D/3rdPartyLinking.cmake
new file mode 100644
index 000000000..72c7afc60
--- /dev/null
+++ b/targets/apps/GKS/MultiGPU_2D/3rdPartyLinking.cmake
@@ -0,0 +1,11 @@
+include (${CMAKE_SOURCE_DIR}/${cmakeMacroPath}/MPI/Link.cmake)
+linkMPI(${targetName})
+include (${CMAKE_SOURCE_DIR}/${cmakeMacroPath}/Cuda/Link.cmake)
+linkCuda(${targetName})
+#include (${CMAKE_SOURCE_DIR}/${cmakeMacroPath}/Metis/Link.cmake)
+#linkMetis(${targetName})
+
+#if(HULC.BUILD_JSONCPP)
+#  include (${CMAKE_SOUR#CE_DIR}/${cmakeMacroPath}/JsonCpp/Link.cmake)
+#  linkJsonCpp(${targetName})
+#endif()
diff --git a/targets/apps/GKS/MultiGPU_2D/CMakeLists.txt b/targets/apps/GKS/MultiGPU_2D/CMakeLists.txt
new file mode 100644
index 000000000..d40431017
--- /dev/null
+++ b/targets/apps/GKS/MultiGPU_2D/CMakeLists.txt
@@ -0,0 +1,19 @@
+setTargetNameToFolderName(${CMAKE_CURRENT_LIST_DIR})
+
+set(linkDirectories "")
+set(libsToLink Core GridGenerator GksMeshAdapter GksVtkAdapter GksGpu)
+set(includeDirectories "${CMAKE_SOURCE_DIR}/src"
+                       "${CMAKE_SOURCE_DIR}/src/Core"
+                       "${CMAKE_SOURCE_DIR}/src/GridGenerator"
+                       "${CMAKE_SOURCE_DIR}/src/GksMeshAdapter"
+                       "${CMAKE_SOURCE_DIR}/src/GksVtkAdapter"
+                       "${CMAKE_SOURCE_DIR}/src/GksGpu")
+
+#glob files and save in MY_SRCS
+include(CMakePackage.cmake)
+
+buildExe(${targetName} "${MY_SRCS}" "${linkDirectories}" "${libsToLink}" "${includeDirectories}")
+groupTarget(${targetName} ${gksAppFolder})
+
+# Specify the linking to 3rdParty libs
+include(3rdPartyLinking.cmake)
diff --git a/targets/apps/GKS/MultiGPU_2D/CMakePackage.cmake b/targets/apps/GKS/MultiGPU_2D/CMakePackage.cmake
new file mode 100644
index 000000000..5d39e3804
--- /dev/null
+++ b/targets/apps/GKS/MultiGPU_2D/CMakePackage.cmake
@@ -0,0 +1,9 @@
+#FILE ENDINGS
+resetFileEndingsToCollect()
+addCAndCPPFileTypes()
+addFileEndingToCollect("*.cu")
+addFileEndingToCollect("*.cuh")
+
+#GLOB SOURCE FILES IN MY_SRCS
+unset(MY_SRCS)
+includeRecursiveAllFilesFrom(${targetName} ${CMAKE_CURRENT_LIST_DIR})
\ No newline at end of file
diff --git a/targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp b/targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp
new file mode 100644
index 000000000..0288c6e50
--- /dev/null
+++ b/targets/apps/GKS/MultiGPU_2D/MultiGPU_2D.cpp
@@ -0,0 +1,476 @@
+//#define MPI_LOGGING
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+#include <string>
+#include <iostream>
+#include <exception>
+#include <fstream>
+#include <sstream>
+#include <memory>
+
+#include "Core/Timer/Timer.h"
+#include "Core/PointerDefinitions.h"
+#include "Core/DataTypes.h"
+#include "Core/VectorTypes.h"
+#include "Core/Logger/Logger.h"
+
+#include "GridGenerator/geometries/Cuboid/Cuboid.h"
+#include "GridGenerator/geometries/Sphere/Sphere.h"
+#include "GridGenerator/geometries/VerticalCylinder/VerticalCylinder.h"
+#include "GridGenerator/geometries/Conglomerate/Conglomerate.h"
+
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/GridFactory.h"
+#include "GridGenerator/geometries/BoundingBox/BoundingBox.h"
+#include "GridGenerator/utilities/communication.h"
+
+#include "GksMeshAdapter/GksMeshAdapter.h"
+
+#include "GksVtkAdapter/VTKInterface.h"
+
+#include "GksGpu/DataBase/DataBase.h"
+#include "GksGpu/Parameters/Parameters.h"
+#include "GksGpu/Initializer/Initializer.h"
+
+#include "GksGpu/FlowStateData/FlowStateData.cuh"
+#include "GksGpu/FlowStateData/FlowStateDataConversion.cuh"
+
+#include "GksGpu/BoundaryConditions/BoundaryCondition.h"
+#include "GksGpu/BoundaryConditions/IsothermalWall.h"
+#include "GksGpu/BoundaryConditions/Periodic.h"
+#include "GksGpu/BoundaryConditions/Pressure.h"
+#include "GksGpu/BoundaryConditions/AdiabaticWall.h"
+
+#include "GksGpu/Communication/Communicator.h"
+
+#include "GksGpu/TimeStepping/NestedTimeStep.h"
+
+#include "GksGpu/Analyzer/CupsAnalyzer.h"
+#include "GksGpu/Analyzer/ConvergenceAnalyzer.h"
+#include "GksGpu/Analyzer/TurbulenceAnalyzer.h"
+
+#include "GksGpu/CudaUtility/CudaUtility.h"
+#include "GksGpu/Communication/MpiUtility.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+void performanceTest( std::string path, std::string simulationName, uint decompositionDimension, uint nx, bool strongScaling )
+{
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    int rank = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    int mpiWorldSize = 1;
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiWorldSize);
+
+    //CudaUtility::setCudaDevice(rank % devicesPerNode);
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    int sideLength, rankX, rankY, rankZ;
+
+    if( decompositionDimension == 2 )
+    {
+        if      (mpiWorldSize == 1)  sideLength = 1;
+        else if (mpiWorldSize == 4)  sideLength = 2;
+        else if (mpiWorldSize == 16) sideLength = 4;
+
+        rankX = rank % sideLength;
+        rankY = rank / sideLength;
+        rankZ = 0;
+    }
+    else if( decompositionDimension == 3 )
+    {
+        if      (mpiWorldSize == 1)  sideLength = 1;
+        else if (mpiWorldSize == 8)  sideLength = 2;
+
+        rankX =   rank % sideLength;
+        rankY = ( rank % ( sideLength * sideLength ) ) / sideLength;
+        rankZ =   rank                                 / ( sideLength * sideLength );
+    }
+
+    *logging::out << logging::Logger::INFO_HIGH << sideLength << " " << rankX << " " << rankY << " " << rankZ << "\n";
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    real L  = 1.0;
+
+    real LX = L;
+    real LY = L;
+    real LZ = L;
+
+    real dx = L / real(nx);
+
+    if( strongScaling )
+    {
+        if( decompositionDimension == 2 )
+        {
+            LX /= double(sideLength);
+            LY /= double(sideLength);
+        }
+        else if( decompositionDimension == 3 )
+        {
+            LX /= double(sideLength);
+            LY /= double(sideLength);
+            LZ /= double(sideLength);
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+
+    Parameters parameters;
+
+    parameters.K  = 0;
+    parameters.Pr = 1;
+    parameters.mu = 0.01;
+
+    parameters.force.x = 0.1;
+    parameters.force.y = 0;
+    parameters.force.z = 0;
+
+    parameters.dt = 0.0001;
+    parameters.dx = dx;
+
+    parameters.lambdaRef = 1.0e-2;
+    
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    auto gridFactory = GridFactory::make();
+    gridFactory->setGridStrategy(Device::CPU);
+    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
+
+    auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    if( mpiWorldSize > 1 )
+    {
+        //if( decompositionDimension == 2 )
+        //{
+            gridBuilder->addCoarseGrid(  rankX*LX    - 0.5*L - 5.0*dx,      rankY*LY    - 0.5*L - 5.0*dx,      rankZ*LZ    - 0.5*L - 5.0*dx,
+                                        (rankX*LX+1) - 0.5*L + 5.0*dx,     (rankY*LY+1) - 0.5*L + 5.0*dx,     (rankZ*LZ+1) - 0.5*L + 5.0*dx, dx);
+        //}
+
+        //if( decompositionDimension == 3 )
+        //{
+        //    gridBuilder->addCoarseGrid( rankX*LX - 0.5*L - 5.0*dx,     rankY*LY - 0.5*L - 5.0*dx,     rankZ*LZ - 0.5*L - 5.0*dx,  
+        //                                rankX*LX + 0.5*L + 5.0*dx,     rankY*LY + 0.5*L + 5.0*dx,     rankZ*LZ + 0.5*L + 5.0*dx, dx);
+        //}
+
+        gridBuilder->setSubDomainBox( std::make_shared<BoundingBox>( rankX*LX - 0.5*L, (rankX+1)*LX - 0.5*L, 
+                                                                     rankY*LY - 0.5*L, (rankY+1)*LY - 0.5*L,
+                                                                     rankZ*LZ - 0.5*L, (rankZ+1)*LZ - 0.5*L  ) );
+    }
+    else
+    {
+        gridBuilder->addCoarseGrid( -0.5*L, -0.5*L, -0.5*L,  
+                                     0.5*L,  0.5*L,  0.5*L, dx);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    gridBuilder->setPeriodicBoundaryCondition(true, true, true);
+
+    gridBuilder->buildGrids(GKS, false);
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    if( decompositionDimension == 1 && mpiWorldSize > 1 )
+    {
+        gridBuilder->findCommunicationIndices( CommunicationDirections::PX, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::PX, (rank + 1 + mpiWorldSize) % mpiWorldSize );
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::MX, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::MX, (rank - 1 + mpiWorldSize) % mpiWorldSize );
+    }
+    else if ( decompositionDimension == 2 && mpiWorldSize > 1 )
+    {
+        int rankPX = ( (rankX + 1 + sideLength) % sideLength ) +    rankY                                  * sideLength;
+        int rankMX = ( (rankX - 1 + sideLength) % sideLength ) +    rankY                                  * sideLength;
+        int rankPY =    rankX                                  + ( (rankY + 1 + sideLength) % sideLength ) * sideLength;
+        int rankMY =    rankX                                  + ( (rankY - 1 + sideLength) % sideLength ) * sideLength;
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::PX, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::PX, rankPX);
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::MX, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::MX, rankMX);
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::PY, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::PY, rankPY);
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::MY, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::MY, rankMY);
+    }
+    else if ( decompositionDimension == 3 && mpiWorldSize > 1 )
+    {
+        int rankPX = ( (rankX + 1 + sideLength) % sideLength ) +    rankY                                  * sideLength +    rankZ                                  * sideLength * sideLength;
+        int rankMX = ( (rankX - 1 + sideLength) % sideLength ) +    rankY                                  * sideLength +    rankZ                                  * sideLength * sideLength;
+        int rankPY =    rankX                                  + ( (rankY + 1 + sideLength) % sideLength ) * sideLength +    rankZ                                  * sideLength * sideLength;
+        int rankMY =    rankX                                  + ( (rankY - 1 + sideLength) % sideLength ) * sideLength +    rankZ                                  * sideLength * sideLength;
+        int rankPZ =    rankX                                  +    rankY                                  * sideLength + ( (rankZ + 1 + sideLength) % sideLength ) * sideLength * sideLength;
+        int rankMZ =    rankX                                  +    rankY                                  * sideLength + ( (rankZ - 1 + sideLength) % sideLength ) * sideLength * sideLength;
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::PX, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::PX, rankPX);
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::MX, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::MX, rankMX);
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::PY, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::PY, rankPY);
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::MY, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::MY, rankMY);
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::PZ, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::PZ, rankPZ);
+
+        gridBuilder->findCommunicationIndices( CommunicationDirections::MZ, GKS );
+        gridBuilder->setCommunicationProcess ( CommunicationDirections::MZ, rankMZ);
+
+        *logging::out << logging::Logger::INFO_HIGH << rankPX << " " << rankMX << " " << rankPY << " " << rankMY << "\n";
+    }
+
+    gridBuilder->writeGridsToVtk(path + "/Grid_rank_" + std::to_string(rank) + "_lev_");
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    GksMeshAdapter meshAdapter( gridBuilder );
+
+    meshAdapter.inputGrid();
+
+    meshAdapter.getCommunicationIndices();
+
+    meshAdapter.findPeriodicBoundaryNeighbors();
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    auto dataBase = std::make_shared<DataBase>( "GPU" );
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    SPtr<BoundaryCondition> bcMX = std::make_shared<Periodic>( dataBase );
+    SPtr<BoundaryCondition> bcPX = std::make_shared<Periodic>( dataBase );
+    //SPtr<BoundaryCondition> bcMX = std::make_shared<AdiabaticWall>( dataBase, Vec3(0.0, 0.0, 0.0), false );
+    //SPtr<BoundaryCondition> bcPX = std::make_shared<AdiabaticWall>( dataBase, Vec3(0.0, 0.1, 0.0), false );
+
+    bcMX->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.x < -0.5*L; } );
+    bcPX->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.x >  0.5*L; } );
+
+    //////////////////////////////////////////////////////////////////////////
+
+    //SPtr<BoundaryCondition> bcMY = std::make_shared<Periodic>( dataBase );
+    //SPtr<BoundaryCondition> bcPY = std::make_shared<Periodic>( dataBase );
+    SPtr<BoundaryCondition> bcMY = std::make_shared<Periodic>( dataBase );
+    SPtr<BoundaryCondition> bcPY = std::make_shared<Periodic>( dataBase );
+
+    bcMY->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.y < -0.5*L; } );
+    bcPY->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.y >  0.5*L; } );
+
+    //////////////////////////////////////////////////////////////////////////
+    
+    //SPtr<BoundaryCondition> bcMZ = std::make_shared<Periodic>( dataBase );
+    //SPtr<BoundaryCondition> bcPZ = std::make_shared<Periodic>( dataBase );
+    SPtr<BoundaryCondition> bcMZ = std::make_shared<Periodic>( dataBase );
+    SPtr<BoundaryCondition> bcPZ = std::make_shared<Periodic>( dataBase );
+    
+    bcMZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z < -0.5*L; } );
+    bcPZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z >  0.5*L; } );
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if( mpiWorldSize == 1 )
+    {
+        dataBase->boundaryConditions.push_back( bcMX );
+        dataBase->boundaryConditions.push_back( bcPX );
+    
+        dataBase->boundaryConditions.push_back( bcMY );
+        dataBase->boundaryConditions.push_back( bcPY );
+    }
+    if( decompositionDimension == 2 || mpiWorldSize == 1 )
+    {
+        dataBase->boundaryConditions.push_back( bcMZ );
+        dataBase->boundaryConditions.push_back( bcPZ );
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+
+    *logging::out << logging::Logger::INFO_HIGH << "bcMX ==> " << bcMX->numberOfCellsPerLevel[0] << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "bcPX ==> " << bcPX->numberOfCellsPerLevel[0] << "\n";
+
+    *logging::out << logging::Logger::INFO_HIGH << "bcMY ==> " << bcMY->numberOfCellsPerLevel[0] << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "bcPY ==> " << bcPY->numberOfCellsPerLevel[0] << "\n";
+
+    *logging::out << logging::Logger::INFO_HIGH << "bcMZ ==> " << bcMZ->numberOfCellsPerLevel[0] << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "bcPZ ==> " << bcPZ->numberOfCellsPerLevel[0] << "\n";
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    dataBase->setMesh( meshAdapter );
+
+    dataBase->setCommunicators( meshAdapter );
+    
+    //*logging::out << logging::Logger::WARNING << int(dataBase->communicators[0].size()) << "\n";
+    //*logging::out << logging::Logger::WARNING << int(dataBase->communicators[0][0].get()) << "\n";
+    //*logging::out << logging::Logger::WARNING << int(dataBase->communicators[0][1].get()) << "\n";
+
+    CudaUtility::printCudaMemoryUsage();
+
+    Initializer::interpret(dataBase, [&] ( Vec3 cellCenter ) -> ConservedVariables
+    {
+        return toConservedVariables( PrimitiveVariables( 1.0, 1.0, 0.0, 0.0, parameters.lambdaRef ), parameters.K );
+    });
+
+    dataBase->copyDataHostToDevice();
+
+    for( auto bc : dataBase->boundaryConditions ) 
+        for( uint level = 0; level < dataBase->numberOfLevels; level++ )
+            bc->runBoundaryConditionKernel( dataBase, parameters, level );
+
+    Initializer::initializeDataUpdate(dataBase);
+
+    dataBase->copyDataDeviceToHost();
+
+    if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_0", mpiWorldSize );
+
+    writeVtkXML( dataBase, parameters, 0, path + simulationName + "_0" + "_rank_" + std::to_string(rank) );
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    CupsAnalyzer cupsAnalyzer( dataBase, true, 30.0 );
+
+    //////////////////////////////////////////////////////////////////////////
+
+    cupsAnalyzer.start();
+
+    for( uint iter = 1; iter <= 10000; iter++ )
+    {
+        TimeStepping::nestedTimeStep(dataBase, parameters, 0);
+
+        cupsAnalyzer.run( iter );
+
+        if( iter % 10000 == 0 )
+        {
+            dataBase->copyDataDeviceToHost();
+
+            if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_" + std::to_string( iter ), mpiWorldSize );
+
+            writeVtkXML( dataBase, parameters, 0, path + simulationName + "_" + std::to_string( iter ) + "_rank_" + std::to_string(rank) );
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+
+    dataBase->copyDataDeviceToHost();
+}
+
+int main( int argc, char* argv[])
+{
+    //////////////////////////////////////////////////////////////////////////
+
+    int rank         = MpiUtility::getMpiRankBeforeInit();
+    int mpiWorldSize = MpiUtility::getMpiWorldSizeBeforeInit();
+
+    //////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32
+    std::string path( "F:/Work/Computations/out/MultiGPU/" );
+#else
+    //std::string path( "/home/stephan/Computations/out/" );
+    std::string path( "out/" );
+#endif
+
+    std::string simulationName ( "MultiGPU_np_" + std::to_string(mpiWorldSize) );
+
+    //////////////////////////////////////////////////////////////////////////
+
+    bool strongScaling = false;
+    uint nx = 64;
+
+    uint decompositionDimension = 2;
+
+    if( argc > 1 ) path += argv[1]; path += "/";
+    if( argc > 2 ) nx = atoi( argv[2] );
+    if( argc > 3 ) strongScaling = true;
+
+    //////////////////////////////////////////////////////////////////////////
+
+    logging::Logger::addStream(&std::cout);
+    
+    std::ofstream logFile( path + simulationName + "_rank_" + std::to_string(rank) + ".log" );
+    logging::Logger::addStream(&logFile);
+
+    logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
+    logging::Logger::timeStamp(logging::Logger::ENABLE);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    // Important: for Cuda-Aware MPI the device must be set before MPI_Init()
+    int deviceCount = CudaUtility::getCudaDeviceCount();
+
+    if(deviceCount == 0)
+    {
+        std::stringstream msg;
+        msg << "No devices devices found!" << std::endl;
+        *logging::out << logging::Logger::WARNING << msg.str(); msg.str("");
+    }
+
+    CudaUtility::setCudaDevice( rank % deviceCount );
+
+    //////////////////////////////////////////////////////////////////////////
+
+    MPI_Init(&argc, &argv);
+    
+    //////////////////////////////////////////////////////////////////////////
+
+    if( sizeof(real) == 4 )
+        *logging::out << logging::Logger::INFO_HIGH << "Using Single Precison\n";
+    else
+        *logging::out << logging::Logger::INFO_HIGH << "Using Double Precision\n";
+
+    //////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
+
+    try
+    {
+        performanceTest( path, simulationName, decompositionDimension, nx, strongScaling );
+    }
+    catch (const std::exception& e)
+    {     
+        *logging::out << logging::Logger::ERROR << e.what() << "\n";
+    }
+    catch (const std::bad_alloc& e)
+    {  
+        *logging::out << logging::Logger::ERROR << "Bad Alloc:" << e.what() << "\n";
+    }
+    catch (...)
+    {
+        *logging::out << logging::Logger::ERROR << "Unknown exception!\n";
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
+
+    logFile.close();
+
+    MPI_Finalize();
+
+   return 0;
+}
diff --git a/targets/apps/GKS/MultiGPU_2D/package.include b/targets/apps/GKS/MultiGPU_2D/package.include
new file mode 100644
index 000000000..e69de29bb
diff --git a/targets/apps/GKS/PoolFire/PoolFire.cpp b/targets/apps/GKS/PoolFire/PoolFire.cpp
index 78f6b796e..a6abbe068 100644
--- a/targets/apps/GKS/PoolFire/PoolFire.cpp
+++ b/targets/apps/GKS/PoolFire/PoolFire.cpp
@@ -141,7 +141,7 @@ void thermalCavity( std::string path, std::string simulationName, uint restartIt
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    bool threeDimensional = true;
+    bool threeDimensional = false;
 
     if( threeDimensional )
     {
@@ -409,7 +409,7 @@ void thermalCavity( std::string path, std::string simulationName, uint restartIt
 
         if( 
             //( iter >= 100 && iter % 10 == 0 ) || 
-            ( iter % 10000 == 0 )
+            ( iter % 400 == 0 )
           )
         {
             dataBase->copyDataDeviceToHost();
@@ -417,7 +417,7 @@ void thermalCavity( std::string path, std::string simulationName, uint restartIt
             writeVtkXML( dataBase, parameters, 0, path + simulationName + "_" + std::to_string( iter ) );
         }
 
-        if( iter % 10000 == 0 )
+        if( iter % 4000 == 0 )
         {
             Restart::writeRestart( dataBase, path + simulationName + "_" + std::to_string( iter ), iter );
         }
-- 
GitLab