diff --git a/src/GksGpu/Analyzer/HeatFluxAnalyzer.cu b/src/GksGpu/Analyzer/HeatFluxAnalyzer.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4e8b2d3776fab8eb99b801f240e5a6f8f4f79a5e
--- /dev/null
+++ b/src/GksGpu/Analyzer/HeatFluxAnalyzer.cu
@@ -0,0 +1,147 @@
+#include "HeatFluxAnalyzer.h"
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+#include <cmath>
+#include <sstream>
+
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/device_ptr.h>
+
+#include <iomanip>
+
+#include "Core/Logger/Logger.h"
+
+#include "DataBase/DataBase.h"
+
+#include "GksGpu/BoundaryConditions/BoundaryCondition.h"
+
+#include "FlowStateData/AccessDeviceData.cuh"
+#include "FlowStateData/FlowStateDataConversion.cuh"
+
+#include "FluxComputation/SutherlandsLaw.cuh"
+
+#include "CudaUtility/CudaRunKernel.hpp"
+
+__global__                 void heatFluxKernel  ( DataBaseStruct  dataBase, GksGpu::BoundaryConditionStruct  boundaryCondition, Parameters  parameters, real* heatFlux, uint startIndex, uint numberOfEntities );
+__host__ __device__ inline void heatFluxFunction( DataBaseStruct& dataBase, GksGpu::BoundaryConditionStruct& boundaryCondition, Parameters& parameters, real* heatFlux, uint startIndex, uint index );
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+bool HeatFluxAnalyzer::run(uint iter, Parameters parameters)
+{
+    if( iter % this->analyzeIter != 0 ) return false;
+
+    uint numberOfCells = this->boundaryCondition->numberOfCellsPerLevel[ dataBase->numberOfLevels - 1 ];
+
+    thrust::device_vector<real> heatFlux( numberOfCells );
+
+    CudaUtility::CudaGrid grid( numberOfCells, 32 );
+
+    for( uint level = 0; level < dataBase->numberOfLevels - 1; level++ ) parameters.dx *= c1o2; 
+
+    runKernel( heatFluxKernel,
+               heatFluxFunction,
+               dataBase->getDeviceType(), grid, 
+               dataBase->toStruct(),
+               boundaryCondition->toStruct(),
+               parameters,
+               heatFlux.data().get(),
+               boundaryCondition->startOfCellsPerLevel[ dataBase->numberOfLevels - 1 ] );
+
+    getLastCudaError("HeatFluxAnalyzer::run(uint iter)");
+
+    real q = thrust::reduce( heatFlux.begin(), heatFlux.end(), c0o1, thrust::plus<real>() ) * parameters.dx * parameters.dx;
+
+    real qIdeal = c1o4 * (parameters.K + c5o1) * ( parameters.mu / parameters.Pr ) * ( c1o1 / lambdaHot - c1o1 / lambdaCold );
+
+    this->heatFluxTimeSeries.push_back( q / qIdeal );
+
+    if( iter % this->outputIter == 0 ) *logging::out << logging::Logger::INFO_HIGH << "q = " << q / qIdeal << "\n";
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void heatFluxKernel(DataBaseStruct  dataBase, GksGpu::BoundaryConditionStruct  boundaryCondition, Parameters  parameters, real* heatFlux, uint startIndex, uint numberOfEntities)
+{
+    uint index = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if( index >= numberOfEntities ) return;
+
+    heatFluxFunction( dataBase, boundaryCondition, parameters, heatFlux, startIndex, index );
+}
+
+__host__ __device__ void heatFluxFunction(DataBaseStruct& dataBase, GksGpu::BoundaryConditionStruct& boundaryCondition, Parameters& parameters, real* heatFlux, uint startIndex, uint index)
+{
+    uint ghostCellIndex  = boundaryCondition.ghostCells [ startIndex + index ];
+    uint domainCellIndex = boundaryCondition.domainCells[ startIndex + index ];
+
+    if( isCellProperties( dataBase.cellProperties[ domainCellIndex ], CELL_PROPERTIES_GHOST ) )
+    {
+        heatFlux[ startIndex + index ] = c0o1;
+        return;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+
+    ConservedVariables ghostCons;
+
+    readCellData(ghostCellIndex, dataBase, ghostCons);
+
+    ConservedVariables domainCons;
+
+    readCellData(domainCellIndex, dataBase, domainCons);
+
+    PrimitiveVariables ghostPrim  = toPrimitiveVariables(ghostCons,  parameters.K);
+    PrimitiveVariables domainPrim = toPrimitiveVariables(domainCons, parameters.K);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    real lambda = c1o2 * (ghostPrim.lambda + domainPrim.lambda);
+
+    real r   = parameters.lambdaRef / lambda;
+
+    real mu;
+    if ( parameters.viscosityModel == ViscosityModel::constant ){
+        mu = parameters.mu;
+    }
+    else if ( parameters.viscosityModel == ViscosityModel::sutherlandsLaw ){
+        mu = sutherlandsLaw( parameters, r );
+    }
+
+    heatFlux[ startIndex + index ] = c1o4 * (parameters.K + c5o1) * ( mu / parameters.Pr ) / parameters.dx * ( c1o1 / domainPrim.lambda - c1o1 / ghostPrim.lambda );
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+HeatFluxAnalyzer::HeatFluxAnalyzer( SPtr<DataBase> dataBase, SPtr<GksGpu::BoundaryCondition> boundaryCondition, uint analyzeIter, uint outputIter, real lambdaHot, real lambdaCold, real L )
+    : dataBase(dataBase), 
+      boundaryCondition(boundaryCondition), 
+      analyzeIter(analyzeIter), 
+      outputIter(outputIter), 
+      lambdaHot(lambdaHot), 
+      lambdaCold(lambdaCold), 
+      L(L)
+{
+}
+
+void HeatFluxAnalyzer::writeToFile(std::string filename)
+{
+    *logging::out << logging::Logger::INFO_INTERMEDIATE << "HeatFluxAnalyzer::writeToFile( " << filename << " )" << "\n";
+
+    std::ofstream file;
+
+    file.open(filename + ".dat" );
+
+    for( auto& EKin : this->heatFluxTimeSeries )
+        file << std::setprecision(15) << EKin << std::endl;
+
+    file.close();
+
+    *logging::out << logging::Logger::INFO_INTERMEDIATE << "done!\n";
+}
+
+
diff --git a/src/GksGpu/Analyzer/HeatFluxAnalyzer.h b/src/GksGpu/Analyzer/HeatFluxAnalyzer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c94955d80a352f3e83a3ba5964328fcfff24e23
--- /dev/null
+++ b/src/GksGpu/Analyzer/HeatFluxAnalyzer.h
@@ -0,0 +1,48 @@
+#ifndef  HeatFluxAnalyzer_H
+#define  HeatFluxAnalyzer_H
+
+#include <vector>
+#include <string>
+
+#include "VirtualFluidsDefinitions.h"
+
+#include "Core/PointerDefinitions.h"
+#include "Core/DataTypes.h"
+
+#include "GksGpu/BoundaryConditions/BoundaryCondition.h"
+
+#include "FlowStateData/FlowStateData.cuh"
+
+#include "Parameters/Parameters.h"
+
+struct DataBase;
+
+class VF_PUBLIC HeatFluxAnalyzer
+{
+private:
+
+    SPtr<DataBase> dataBase;
+    SPtr<GksGpu::BoundaryCondition> boundaryCondition;
+
+    uint outputIter;
+
+    uint analyzeIter;
+
+    std::vector<real> heatFluxTimeSeries;
+
+    real lambdaHot;
+    real lambdaCold;
+
+    real L;
+
+public:
+
+    HeatFluxAnalyzer( SPtr<DataBase> dataBase, SPtr<GksGpu::BoundaryCondition> boundaryCondition, uint analyzeIter, uint outputIter, real lambdaHot, real lambdaCold, real L );
+
+    bool run( uint iter, Parameters parameters );
+
+    void writeToFile( std::string filename );
+
+};
+
+#endif
diff --git a/targets/apps/GKS/RayleighBenardMultiGPU/3rdPartyLinking.cmake b/targets/apps/GKS/RayleighBenardMultiGPU/3rdPartyLinking.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..72c7afc6076b832263506ab9ce777925cfcc6a66
--- /dev/null
+++ b/targets/apps/GKS/RayleighBenardMultiGPU/3rdPartyLinking.cmake
@@ -0,0 +1,11 @@
+include (${CMAKE_SOURCE_DIR}/${cmakeMacroPath}/MPI/Link.cmake)
+linkMPI(${targetName})
+include (${CMAKE_SOURCE_DIR}/${cmakeMacroPath}/Cuda/Link.cmake)
+linkCuda(${targetName})
+#include (${CMAKE_SOURCE_DIR}/${cmakeMacroPath}/Metis/Link.cmake)
+#linkMetis(${targetName})
+
+#if(HULC.BUILD_JSONCPP)
+#  include (${CMAKE_SOUR#CE_DIR}/${cmakeMacroPath}/JsonCpp/Link.cmake)
+#  linkJsonCpp(${targetName})
+#endif()
diff --git a/targets/apps/GKS/RayleighBenardMultiGPU/CMakeLists.txt b/targets/apps/GKS/RayleighBenardMultiGPU/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d404310177a2f53760d1c84bce79d7d070fed409
--- /dev/null
+++ b/targets/apps/GKS/RayleighBenardMultiGPU/CMakeLists.txt
@@ -0,0 +1,19 @@
+setTargetNameToFolderName(${CMAKE_CURRENT_LIST_DIR})
+
+set(linkDirectories "")
+set(libsToLink Core GridGenerator GksMeshAdapter GksVtkAdapter GksGpu)
+set(includeDirectories "${CMAKE_SOURCE_DIR}/src"
+                       "${CMAKE_SOURCE_DIR}/src/Core"
+                       "${CMAKE_SOURCE_DIR}/src/GridGenerator"
+                       "${CMAKE_SOURCE_DIR}/src/GksMeshAdapter"
+                       "${CMAKE_SOURCE_DIR}/src/GksVtkAdapter"
+                       "${CMAKE_SOURCE_DIR}/src/GksGpu")
+
+#glob files and save in MY_SRCS
+include(CMakePackage.cmake)
+
+buildExe(${targetName} "${MY_SRCS}" "${linkDirectories}" "${libsToLink}" "${includeDirectories}")
+groupTarget(${targetName} ${gksAppFolder})
+
+# Specify the linking to 3rdParty libs
+include(3rdPartyLinking.cmake)
diff --git a/targets/apps/GKS/RayleighBenardMultiGPU/CMakePackage.cmake b/targets/apps/GKS/RayleighBenardMultiGPU/CMakePackage.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..5d39e3804dbd180790629111449a7dc918292430
--- /dev/null
+++ b/targets/apps/GKS/RayleighBenardMultiGPU/CMakePackage.cmake
@@ -0,0 +1,9 @@
+#FILE ENDINGS
+resetFileEndingsToCollect()
+addCAndCPPFileTypes()
+addFileEndingToCollect("*.cu")
+addFileEndingToCollect("*.cuh")
+
+#GLOB SOURCE FILES IN MY_SRCS
+unset(MY_SRCS)
+includeRecursiveAllFilesFrom(${targetName} ${CMAKE_CURRENT_LIST_DIR})
\ No newline at end of file
diff --git a/targets/apps/GKS/RayleighBenardMultiGPU/RayleighBenardMultiGPU.cpp b/targets/apps/GKS/RayleighBenardMultiGPU/RayleighBenardMultiGPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..333917e1d69c82fad7413bd844ba234476293061
--- /dev/null
+++ b/targets/apps/GKS/RayleighBenardMultiGPU/RayleighBenardMultiGPU.cpp
@@ -0,0 +1,632 @@
+//#define MPI_LOGGING
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <exception>
+#include <fstream>
+#include <memory>
+#include <thread>
+
+#include <mpi.h>
+
+#include "Core/Timer/Timer.h"
+#include "Core/PointerDefinitions.h"
+#include "Core/DataTypes.h"
+#include "Core/VectorTypes.h"
+#include "Core/Logger/Logger.h"
+
+#include "GridGenerator/geometries/Cuboid/Cuboid.h"
+#include "GridGenerator/geometries/Conglomerate/Conglomerate.h"
+
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/GridFactory.h"
+#include "GridGenerator/geometries/BoundingBox/BoundingBox.h"
+#include "GridGenerator/utilities/communication.h"
+
+#include "GksMeshAdapter/GksMeshAdapter.h"
+
+#include "GksVtkAdapter/VTKInterface.h"
+
+#include "GksGpu/DataBase/DataBase.h"
+#include "GksGpu/Parameters/Parameters.h"
+#include "GksGpu/Initializer/Initializer.h"
+
+#include "GksGpu/FlowStateData/FlowStateDataConversion.cuh"
+
+#include "GksGpu/BoundaryConditions/BoundaryCondition.h"
+#include "GksGpu/BoundaryConditions/IsothermalWall.h"
+#include "GksGpu/BoundaryConditions/Periodic.h"
+#include "GksGpu/BoundaryConditions/Pressure.h"
+#include "GksGpu/BoundaryConditions/AdiabaticWall.h"
+
+#include "GksGpu/Communication/Communicator.h"
+#include "GksGpu/Communication/MpiUtility.h"
+
+#include "GksGpu/TimeStepping/NestedTimeStep.h"
+
+#include "GksGpu/Analyzer/CupsAnalyzer.h"
+#include "GksGpu/Analyzer/ConvergenceAnalyzer.h"
+#include "GksGpu/Analyzer/TurbulenceAnalyzer.h"
+#include "GksGpu/Analyzer/PointTimeSeriesCollector.h"
+#include "GksGpu/Analyzer/HeatFluxAnalyzer.h"
+
+#include "GksGpu/Restart/Restart.h"
+
+#include "GksGpu/CudaUtility/CudaUtility.h"
+
+//uint deviceMap [2] = {2,3};
+uint deviceMap [2] = {0,1};
+
+void simulation( std::string path, std::string simulationName, bool fine, uint restartIter )
+{
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    int rank = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    int mpiWorldSize = 1;
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiWorldSize);
+
+    int sideLengthX, sideLengthY, sideLengthZ, rankX, rankY, rankZ;
+
+    if      (mpiWorldSize == 1 ) { sideLengthX = 1; sideLengthY = 1; sideLengthZ = 1; }
+    else if (mpiWorldSize == 2 ) { sideLengthX = 1; sideLengthY = 1; sideLengthZ = 2; }
+    else if (mpiWorldSize == 4 ) { sideLengthX = 1; sideLengthY = 2; sideLengthZ = 2; }
+    else if (mpiWorldSize == 8 ) { sideLengthX = 2; sideLengthY = 2; sideLengthZ = 2; }
+    else
+    {
+        throw std::runtime_error( "This number of processes is not supported for this target!" );
+    }
+
+    rankZ =   rank %   sideLengthZ;
+    rankY = ( rank % ( sideLengthZ * sideLengthY ) ) /   sideLengthZ;
+    rankX =   rank                                   / ( sideLengthY * sideLengthZ );
+
+    *logging::out << logging::Logger::INFO_HIGH << "SideLength = " << sideLengthX << " " << sideLengthY << " " << sideLengthZ << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "rank       = " << rankX << " " << rankY << " " << rankZ << "\n";
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    uint nx = 64;
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    real L = 1.0;
+
+    real dx = L / real(nx);
+
+    //real Ra = 1.0e7;
+    real Ra = 1.0e2;
+
+    real Ba  = 0.1;
+    real eps = 0.8;
+    real Pr  = 0.71;
+    real K   = 2.0;
+    
+    real g   = 1.0;
+    real rho = 1.0;
+
+    real lambda     = Ba / ( 2.0 * g * L );
+    real lambdaHot  = lambda / ( 1.0 + eps * 0.5 );
+    real lambdaCold = lambda / ( 1.0 - eps * 0.5 );
+    
+    real mu = sqrt( Pr * eps * g * L * L * L / Ra ) * rho ;
+
+    real cs  = sqrt( ( ( K + 4.0 ) / ( K + 2.0 ) ) / ( 2.0 * lambda ) );
+    real U   = sqrt( Ra ) * mu / ( rho * L );
+
+    real CFL = 0.05;
+
+    real dt  = CFL * ( dx / ( ( U + cs ) * ( c1o1 + ( c2o1 * mu ) / ( U * dx * rho ) ) ) );
+
+    *logging::out << logging::Logger::INFO_HIGH << "dt = " << dt << " s\n";
+    *logging::out << logging::Logger::INFO_HIGH << "U  = " << U  << " s\n";
+    *logging::out << logging::Logger::INFO_HIGH << "mu = " << mu << " s\n";
+
+    //////////////////////////////////////////////////////////////////////////
+
+    Parameters parameters;
+
+    parameters.K  = K;
+    parameters.Pr = Pr;
+    parameters.mu = mu;
+
+    parameters.force.x = 0;
+    parameters.force.y = 0;
+    parameters.force.z = -g;
+
+    parameters.dt = dt;
+    parameters.dx = dx;
+
+    parameters.lambdaRef = lambda;
+
+    //parameters.viscosityModel = ViscosityModel::sutherlandsLaw;
+    parameters.viscosityModel = ViscosityModel::constant;
+
+    parameters.forcingSchemeIdx = 0;
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //                M e s h    G e n e r a t i o n
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    auto gridFactory = GridFactory::make();
+    gridFactory->setGridStrategy(Device::CPU);
+    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
+
+    auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    real LX = L / double(sideLengthX);
+    real LY = L / double(sideLengthY);
+    real LZ = L / double(sideLengthZ);
+
+    real xOverlap = ( sideLengthX == 1 ) ? 0.0 : 5.0*dx;
+    real yOverlap = ( sideLengthY == 1 ) ? 0.0 : 5.0*dx;
+    real zOverlap = ( sideLengthZ == 1 ) ? 0.0 : 5.0*dx;
+
+    real startX, endX;
+    real startY, endY;
+    real startZ, endZ;
+
+    if( sideLengthX > 1 && rankX == 1 ) startX = -3.0 * dx;
+    else                                startX = -0.5 * L;
+    if( sideLengthX > 1 && rankX == 0 ) endX   =  3.0 * dx;
+    else                                endX   =  0.5 * L;
+
+    if( sideLengthY > 1 && rankY == 1 ) startY = -3.0 * dx;
+    else                                startY = -0.5 * L;
+    if( sideLengthY > 1 && rankY == 0 ) endY   =  3.0 * dx;
+    else                                endY   =  0.5 * L;
+
+    if( sideLengthZ > 1 && rankZ == 1 ) startZ = -3.0 * dx;
+    else                                startZ = -0.5 * L;
+    if( sideLengthZ > 1 && rankZ == 0 ) endZ   =  3.0 * dx;
+    else                                endZ   =  0.5 * L;
+
+    gridBuilder->addCoarseGrid(startX, startY, startZ,  
+                               endX  , endY  , endZ  , dx);
+
+    std::cout << __LINE__ << std::endl;
+
+    //////////////////////////////////////////////////////////////////////////
+
+    real refL[4] = { 0.05, 0.02, 0.025, 0.005 };
+
+    if( fine )
+    {
+        refL[1] = 0.1;
+        refL[2] = 0.05;
+    }
+
+    gridBuilder->setNumberOfLayers(6,6);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    Conglomerate coarseRefLevel;
+
+    if( sideLengthX == 1 || rankX == 0 ) coarseRefLevel.add( new Cuboid (-100.0, -100.0,           -100.0, 
+                                                                          100.0, -0.5*L + refL[0],  100.0 ) );
+    if( sideLengthX == 1 || rankX == 1 ) coarseRefLevel.add( new Cuboid (-100.0,  0.5*L - refL[0], -100.0, 
+                                                                          100.0,  100.0,            100.0 ) );
+
+    if( sideLengthY == 1 || rankY == 0 ) coarseRefLevel.add( new Cuboid (-100.0,           -100.0, -100.0, 
+                                                                         -0.5*L + refL[0],  100.0,  100.0 ) );
+    if( sideLengthY == 1 || rankY == 1 ) coarseRefLevel.add( new Cuboid ( 0.5*L - refL[0], -100.0, -100.0, 
+                                                                          100.0,            100.0,  100.0  ) );
+
+    if( sideLengthZ == 1 || rankZ == 0 ) coarseRefLevel.add( new Cuboid (-100.0, -100.0, -100.0, 
+                                                                          100.0,  100.0, -0.5*L + refL[0] ) );
+    if( sideLengthZ == 1 || rankZ == 1 ) coarseRefLevel.add( new Cuboid (-100.0, -100.0,  0.5*L - refL[0], 
+                                                                          100.0,  100.0,  100.0           ) );
+
+    gridBuilder->addGrid( &coarseRefLevel, 1);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    Conglomerate firstRefLevel;
+
+    if( sideLengthZ == 1 || rankZ == 0 ) firstRefLevel.add( new Cuboid (-100.0, -100.0, -100.0, 
+                                                                         100.0,  100.0, -0.5*L + refL[1] ) );
+    if( sideLengthZ == 1 || rankZ == 1 ) firstRefLevel.add( new Cuboid (-100.0, -100.0,  0.5*L - refL[1], 
+                                                                         100.0,  100.0,  100.0           ) );
+
+    //gridBuilder->addGrid( &firstRefLevel, 2);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    //Conglomerate secondRefLevel;
+
+    //if( rank % 2 == 0 ) secondRefLevel.add( new Cuboid (-100.0,           -100.0, -100.0, 
+    //                                                    -0.5*L + refL[2],  100.0,  100.0 ) );
+    //else                secondRefLevel.add( new Cuboid ( 0.5*L - refL[2], -100.0, -100.0, 
+    //                                                     100.0,            100.0,  100.0 ) );
+
+    //if( rank % 2 == 0 ) secondRefLevel.add( new Cuboid (-100.0,           -100.0, -100.0,   
+    //                                                    -0.5*L + refL[0],  100.0, -0.5*H + refL[2] ) );
+    //else                secondRefLevel.add( new Cuboid ( 0.5*L - refL[0], -100.0, -100.0,   
+    //                                                     100.0,            100.0, -0.5*H + refL[2] ) );
+
+    //if( rank % 2 == 0 ) secondRefLevel.add( new Cuboid (-100.0,           -100.0,  0.5*H - refL[2], 
+    //                                                    -0.5*L + refL[0],  100.0,  100.0   ) );
+    //else                secondRefLevel.add( new Cuboid ( 0.5*L - refL[0], -100.0,  0.5*H - refL[2], 
+    //                                                     100.0,            100.0,  100.0   ) );
+
+    //gridBuilder->addGrid( &secondRefLevel, 3);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    //Conglomerate thirdRefLevel;
+
+    //if( rank % 2 == 0 ) thirdRefLevel.add( new Cuboid (-100.0,           -100.0, -100.0, 
+    //                                                   -0.5*L + refL[3],  100.0,  100.0 ) );
+    //else                thirdRefLevel.add( new Cuboid ( 0.5*L - refL[3], -100.0, -100.0, 
+    //                                                    100.0,            100.0,  100.0 ) );
+
+    //if( fine ) gridBuilder->addGrid( &thirdRefLevel, 4);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if( sideLengthX > 1 && rankX == 1 ) startX =    0.0;
+    else                                startX = -100.0;
+    if( sideLengthX > 1 && rankX == 0 ) endX   =    0.0;
+    else                                endX   =  100.0;
+
+    if( sideLengthY > 1 && rankY == 1 ) startY =    0.0;
+    else                                startY = -100.0;
+    if( sideLengthY > 1 && rankY == 0 ) endY   =    0.0;
+    else                                endY   =  100.0;
+
+    if( sideLengthZ > 1 && rankZ == 1 ) startZ =    0.0;
+    else                                startZ = -100.0;
+    if( sideLengthZ > 1 && rankZ == 0 ) endZ   =    0.0;
+    else                                endZ   =  100.0;
+
+    auto subDomainBox = std::make_shared<BoundingBox>( startX, endX, 
+                                                       startY, endY, 
+                                                       startZ, endZ );
+
+    if( mpiWorldSize > 1 ) gridBuilder->setSubDomainBox( subDomainBox );
+
+    //////////////////////////////////////////////////////////////////////////
+
+    gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+
+    gridBuilder->buildGrids(GKS, false);
+            
+    //gridBuilder->writeGridsToVtk( path + simulationName + "_0" + "_rank_" + std::to_string(rank) + "_lev_" );
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if( mpiWorldSize > 1 )
+    {
+        int rankPX = ( (rankX + 1 + sideLengthX) % sideLengthX ) +    rankY                                    * sideLengthX +    rankZ                                    * sideLengthX * sideLengthY;
+        int rankMX = ( (rankX - 1 + sideLengthX) % sideLengthX ) +    rankY                                    * sideLengthX +    rankZ                                    * sideLengthX * sideLengthY;
+        int rankPY =    rankX                                    + ( (rankY + 1 + sideLengthY) % sideLengthY ) * sideLengthX +    rankZ                                    * sideLengthX * sideLengthY;
+        int rankMY =    rankX                                    + ( (rankY - 1 + sideLengthY) % sideLengthY ) * sideLengthX +    rankZ                                    * sideLengthX * sideLengthY;
+        int rankPZ =    rankX                                    +    rankY                                    * sideLengthX + ( (rankZ + 1 + sideLengthZ) % sideLengthZ ) * sideLengthX * sideLengthY;
+        int rankMZ =    rankX                                    +    rankY                                    * sideLengthX + ( (rankZ - 1 + sideLengthZ) % sideLengthZ ) * sideLengthX * sideLengthY;
+
+        if( sideLengthX > 1 && rankX == 0 ) gridBuilder->findCommunicationIndices( CommunicationDirections::PX, GKS );
+        if( sideLengthX > 1 && rankX == 0 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::PX, rankPX);
+
+        if( sideLengthX > 1 && rankX == 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::MX, GKS );
+        if( sideLengthX > 1 && rankX == 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::MX, rankMX);
+
+        if( sideLengthY > 1 && rankY == 0 ) gridBuilder->findCommunicationIndices( CommunicationDirections::PY, GKS );
+        if( sideLengthY > 1 && rankY == 0 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::PY, rankPY);
+
+        if( sideLengthY > 1 && rankY == 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::MY, GKS );
+        if( sideLengthY > 1 && rankY == 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::MY, rankMY);
+
+        if( sideLengthZ > 1 && rankZ == 0 ) gridBuilder->findCommunicationIndices( CommunicationDirections::PZ, GKS );
+        if( sideLengthZ > 1 && rankZ == 0 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::PZ, rankPZ);
+
+        if( sideLengthZ > 1 && rankZ == 1 ) gridBuilder->findCommunicationIndices( CommunicationDirections::MZ, GKS );
+        if( sideLengthZ > 1 && rankZ == 1 ) gridBuilder->setCommunicationProcess ( CommunicationDirections::MZ, rankMZ);
+
+        *logging::out << logging::Logger::INFO_HIGH << "neighborRanks = " << rankPX << " " << rankMX << " " << rankPY << " " << rankMY << " " << rankPZ << " " << rankMZ << "\n";
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    GksMeshAdapter meshAdapter( gridBuilder );
+
+    meshAdapter.inputGrid();
+
+    //if( mpiWorldSize == 2 ) meshAdapter.findPeriodicBoundaryNeighbors();    
+
+    //meshAdapter.writeMeshFaceVTK( path + simulationName + "_0" + "_rank_" + std::to_string(rank) + ".vtk" );
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    auto dataBase = std::make_shared<DataBase>( "GPU" );
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //                 B o u n d a r y    C o n d i t i o n s
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    SPtr<GksGpu::BoundaryCondition> bcMX = std::make_shared<GksGpu::AdiabaticWall>( dataBase, Vec3(0,0,0), false );
+    SPtr<GksGpu::BoundaryCondition> bcPX = std::make_shared<GksGpu::AdiabaticWall>( dataBase, Vec3(0,0,0), false );
+
+    SPtr<GksGpu::BoundaryCondition> bcMY = std::make_shared<GksGpu::AdiabaticWall>( dataBase, Vec3(0,0,0), false );
+    SPtr<GksGpu::BoundaryCondition> bcPY = std::make_shared<GksGpu::AdiabaticWall>( dataBase, Vec3(0,0,0), false );
+
+    bcMX->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.x < -0.5*L; } );
+    bcPX->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.x >  0.5*L; } );
+
+    bcMY->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.y < -0.5*L; } );
+    bcPY->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.y >  0.5*L; } );
+
+    //////////////////////////////////////////////////////////////////////////
+    
+    SPtr<GksGpu::BoundaryCondition> bcMZ = std::make_shared<GksGpu::IsothermalWall>( dataBase, Vec3(0.0, 0.0, 0.0), lambdaHot , false );
+    SPtr<GksGpu::BoundaryCondition> bcPZ = std::make_shared<GksGpu::IsothermalWall>( dataBase, Vec3(0.0, 0.0, 0.0), lambdaCold, false );
+
+    bcMZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z < -0.5*L; } );
+    bcPZ->findBoundaryCells( meshAdapter, true, [&](Vec3 center){ return center.z >  0.5*L; } );
+
+    //////////////////////////////////////////////////////////////////////////
+
+    dataBase->boundaryConditions.push_back( bcMX );
+    dataBase->boundaryConditions.push_back( bcPX );
+
+    dataBase->boundaryConditions.push_back( bcMY );
+    dataBase->boundaryConditions.push_back( bcPY );
+
+    dataBase->boundaryConditions.push_back( bcMZ );
+    dataBase->boundaryConditions.push_back( bcPZ );
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //                 I n i t i a l    C o n d i t i o n s
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    uint startIter = 0;
+
+    dataBase->setMesh( meshAdapter );
+
+    dataBase->setCommunicators( meshAdapter );
+
+    CudaUtility::printCudaMemoryUsage();
+
+    if( restartIter == INVALID_INDEX )
+    {
+        Initializer::interpret(dataBase, [&](Vec3 cellCenter) -> ConservedVariables {
+
+            //real Th = 1.0 / lambdaHot;
+            //real Tc = 1.0 / lambdaCold;
+            //real T = Th - (Th - Tc)*((cellCenter.x + 0.5 * L) / L);
+            //real lambdaLocal = 1.0 / T;
+
+            return toConservedVariables(PrimitiveVariables(rho, 0.0, 0.0, 0.0, lambda), parameters.K);
+        });
+
+        if (rank == 0) writeVtkXMLParallelSummaryFile(dataBase, parameters, path + simulationName + "_0", mpiWorldSize);
+
+        writeVtkXML(dataBase, parameters, 0, path + simulationName + "_0" + "_rank_" + std::to_string(rank));
+    }
+    else
+    {
+        Restart::readRestart( dataBase, path + simulationName + "_" + std::to_string( restartIter ) + "_rank_" + std::to_string(rank), startIter );
+
+        if (rank == 0) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_" + std::to_string( restartIter ) + "_restart", mpiWorldSize );
+
+        writeVtkXML( dataBase, parameters, 0, path + simulationName + "_" + std::to_string( restartIter ) + "_restart" + "_rank_" + std::to_string(rank) );
+
+
+    }
+
+    dataBase->copyDataHostToDevice();
+
+    Initializer::initializeDataUpdate(dataBase);
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //                  R u n
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    CupsAnalyzer cupsAnalyzer( dataBase, true, 300.0 );
+
+    ConvergenceAnalyzer convergenceAnalyzer( dataBase );
+
+    //auto turbulenceAnalyzer = std::make_shared<TurbulenceAnalyzer>( dataBase, 0 );
+    auto turbulenceAnalyzer = std::make_shared<TurbulenceAnalyzer>( dataBase, 50000 );
+
+    turbulenceAnalyzer->collect_UU = true;
+    turbulenceAnalyzer->collect_VV = true;
+    turbulenceAnalyzer->collect_WW = true;
+    turbulenceAnalyzer->collect_UV = true;
+    turbulenceAnalyzer->collect_UW = true;
+    turbulenceAnalyzer->collect_VW = true;
+
+    turbulenceAnalyzer->allocate();
+
+    if( restartIter != INVALID_INDEX )
+        turbulenceAnalyzer->readRestartFile( path + simulationName + "_Turbulence_" + std::to_string( restartIter ) + "_rank_" + std::to_string(rank) );
+
+    //auto pointTimeSeriesCollector = std::make_shared<PointTimeSeriesCollector>();
+
+    //for( real y = 0.5 * W; y < real( mpiWorldSize / 2 ) * W; y += W )
+    //{
+    //    if( subDomainBox->isInside( -0.485, y, -0.3*H ) ) pointTimeSeriesCollector->addAnalyzer( dataBase, meshAdapter, Vec3( -0.485, y, -0.3*H ), 'W', 10000 );
+    //    if( subDomainBox->isInside( -0.485, y, -0.1*H ) ) pointTimeSeriesCollector->addAnalyzer( dataBase, meshAdapter, Vec3( -0.485, y, -0.1*H ), 'W', 10000 );
+    //    if( subDomainBox->isInside( -0.485, y,  0.1*H ) ) pointTimeSeriesCollector->addAnalyzer( dataBase, meshAdapter, Vec3( -0.485, y,  0.1*H ), 'W', 10000 );
+    //    if( subDomainBox->isInside( -0.485, y,  0.3*H ) ) pointTimeSeriesCollector->addAnalyzer( dataBase, meshAdapter, Vec3( -0.485, y,  0.3*H ), 'W', 10000 );
+    //    
+    //    if( subDomainBox->isInside(  0.485, y, -0.3*H ) ) pointTimeSeriesCollector->addAnalyzer( dataBase, meshAdapter, Vec3(  0.485, y, -0.3*H ), 'W', 10000 );
+    //    if( subDomainBox->isInside(  0.485, y, -0.1*H ) ) pointTimeSeriesCollector->addAnalyzer( dataBase, meshAdapter, Vec3(  0.485, y, -0.1*H ), 'W', 10000 );
+    //    if( subDomainBox->isInside(  0.485, y,  0.1*H ) ) pointTimeSeriesCollector->addAnalyzer( dataBase, meshAdapter, Vec3(  0.485, y,  0.1*H ), 'W', 10000 );
+    //    if( subDomainBox->isInside(  0.485, y,  0.3*H ) ) pointTimeSeriesCollector->addAnalyzer( dataBase, meshAdapter, Vec3(  0.485, y,  0.3*H ), 'W', 10000 );
+    //}
+
+    HeatFluxAnalyzer heatFluxAnalyzerPZ(dataBase, bcPZ, 100, 100, lambdaHot, lambdaCold, L);
+    HeatFluxAnalyzer heatFluxAnalyzerMZ(dataBase, bcMZ, 100, 100, lambdaHot, lambdaCold, L);
+    //HeatFluxAnalyzer heatFluxAnalyzer(dataBase, bcPZ);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    cupsAnalyzer.start();
+
+    for( uint iter = startIter + 1; iter <= 100000000; iter++ )
+    {
+        TimeStepping::nestedTimeStep(dataBase, parameters, 0);
+
+        if( iter % 10000 == 0 )
+        {
+            dataBase->copyDataDeviceToHost();
+
+            if( rank == 0 ) writeVtkXMLParallelSummaryFile( dataBase, parameters, path + simulationName + "_" + std::to_string( iter ), mpiWorldSize );
+
+            writeVtkXML( dataBase, parameters, 0, path + simulationName + "_" + std::to_string( iter ) + "_rank_" + std::to_string(rank) );
+        }
+
+        cupsAnalyzer.run( iter, parameters.dt );
+
+        convergenceAnalyzer.run( iter );
+
+        turbulenceAnalyzer->run( iter, parameters );
+
+        if(rankZ == 1) heatFluxAnalyzerPZ.run( iter, parameters );
+        if(rankZ == 0) heatFluxAnalyzerMZ.run( iter, parameters );
+
+        //pointTimeSeriesCollector->run(iter, parameters);
+
+        if( iter > 50000 && iter % 10000 == 0 )
+        //if(iter % 1000 == 0)
+        {
+            turbulenceAnalyzer->download();
+
+            if( rank == 0 ) writeTurbulenceVtkXMLParallelSummaryFile( dataBase, turbulenceAnalyzer, parameters, path + simulationName + "_Turbulence_" + std::to_string( iter ), mpiWorldSize );
+
+            writeTurbulenceVtkXML( dataBase, turbulenceAnalyzer, 0, path + simulationName + "_Turbulence_" + std::to_string( iter ) + "_rank_" + std::to_string(rank) );
+        }
+
+        if( iter > 50000 && iter % 10000 == 0 )
+        {
+            Restart::writeRestart( dataBase, path + simulationName + "_" + std::to_string( iter ) + "_rank_" + std::to_string(rank), iter );
+
+            turbulenceAnalyzer->writeRestartFile( path + simulationName + "_Turbulence_" + std::to_string( iter ) + "_rank_" + std::to_string(rank) );
+        }
+
+        //if( iter % 1000000 == 0 )
+        //{
+        //    pointTimeSeriesCollector->writeToFile(path + simulationName + "_TimeSeries_" + std::to_string( iter ) + "_rank_" + std::to_string(rank));
+        //}
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+
+    dataBase->copyDataDeviceToHost();
+}
+
+
+
+int main( int argc, char* argv[])
+{
+    //////////////////////////////////////////////////////////////////////////
+
+    bool fine = false;
+
+    bool highAspect = true;
+
+    //////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32
+    MPI_Init(&argc, &argv);
+    int rank = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    int mpiWorldSize = 1;
+    MPI_Comm_size(MPI_COMM_WORLD, &mpiWorldSize);
+#else
+    int rank         = MpiUtility::getMpiRankBeforeInit();
+    int mpiWorldSize = MpiUtility::getMpiWorldSizeBeforeInit();
+#endif
+
+    //////////////////////////////////////////////////////////////////////////
+
+#ifdef _WIN32
+    std::string path( "F:/Work/Computations/out/RayleighBenardMultiGPU/" );
+#else
+    std::string path( "out/" );
+#endif
+
+    std::string simulationName ( "ThermalCavity3D" );
+
+    if(fine) simulationName += "_fine";
+    else     simulationName += "_coarse";
+
+    //////////////////////////////////////////////////////////////////////////
+
+    logging::Logger::addStream(&std::cout);
+    
+    std::ofstream logFile( path + simulationName + "_rank_" + std::to_string(rank) + ".log" );
+    logging::Logger::addStream(&logFile);
+
+    logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
+    logging::Logger::timeStamp(logging::Logger::ENABLE);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    // Important: for Cuda-Aware MPI the device must be set before MPI_Init()
+    int deviceCount = CudaUtility::getCudaDeviceCount();
+
+    if(deviceCount == 0)
+    {
+        std::stringstream msg;
+        msg << "No devices devices found!" << std::endl;
+        *logging::out << logging::Logger::WARNING << msg.str(); msg.str("");
+    }
+
+    CudaUtility::setCudaDevice( rank % deviceCount );
+
+    //////////////////////////////////////////////////////////////////////////
+
+#ifndef _WIN32
+    MPI_Init(&argc, &argv);
+#endif
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if( sizeof(real) == 4 )
+        *logging::out << logging::Logger::INFO_HIGH << "Using Single Precision\n";
+    else
+        *logging::out << logging::Logger::INFO_HIGH << "Using Double Precision\n";
+
+    try
+    {
+        uint restartIter = INVALID_INDEX;
+
+        if( argc > 1 ) restartIter = atoi( argv[1] );
+
+        simulation(path, simulationName, fine, restartIter);
+    }
+    catch (const std::exception& e)
+    {     
+        *logging::out << logging::Logger::LOGGER_ERROR << e.what() << "\n";
+    }
+    catch (const std::bad_alloc& e)
+    {  
+        *logging::out << logging::Logger::LOGGER_ERROR << "Bad Alloc:" << e.what() << "\n";
+    }
+    catch (...)
+    {
+        *logging::out << logging::Logger::LOGGER_ERROR << "Unknown exception!\n";
+    }
+
+    logFile.close();
+
+    MPI_Finalize();
+
+    return 0;
+}
diff --git a/targets/apps/GKS/RayleighBenardMultiGPU/package.include b/targets/apps/GKS/RayleighBenardMultiGPU/package.include
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391