diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index f78e30c81f1c787e4d5748a0c624e561521f0922..2854c1f6784fd5a87a8dbc1031d01b06b16e4325 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -7,7 +7,8 @@
         "ms-vscode.cpptools",
         "visualstudioexptteam.vscodeintellicode",
         "xaver.clang-format",
-        "notskm.clang-tidy"
+        "notskm.clang-tidy",
+        "streetsidesoftware.code-spell-checker"
     ],
     "runArgs": ["--gpus","all"],
     "image": "git.rz.tu-bs.de:4567/irmb/virtualfluids/ubuntu20_04:1.2"
diff --git a/.gitignore b/.gitignore
index 285f67f351ae8abf4d088f0364417b2f55d4883d..d16238c4ac7d45440117af9bc7ac0479a90dae2d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,20 +13,19 @@ __pycache__/
 
 # IDE
 .vscode/
-.devcontainer/
+.vscode-server/
 .sync/
 .idea/
-.vscode-server
 
-# Simulation results
+# simulation results
 output/
 logs/
 
-# Input files
-stl/
-
-# Scripts
+# scripts
 scripts/
 
+# stl files
+stl/
+
 # MacOS
 .DS_Store
diff --git a/CMake/cmake_config_files/TESLA03.config.cmake b/CMake/cmake_config_files/TESLA03.config.cmake
index 91672511794eeb452f97a1aca952416c7e1179d5..f319e7bff9de9645d72b1598cec77652375b4d07 100644
--- a/CMake/cmake_config_files/TESLA03.config.cmake
+++ b/CMake/cmake_config_files/TESLA03.config.cmake
@@ -10,10 +10,7 @@ SET(GMOCK_ROOT ${CMAKE_SOURCE_DIR}/3rdParty/googletest CACHE PATH "GMOCK ROOT")
 SET(JSONCPP_ROOT ${CMAKE_SOURCE_DIR}/3rdParty/jsoncpp CACHE PATH "JSONCPP ROOT")
 SET(FFTW_ROOT ${CMAKE_SOURCE_DIR}/3rdParty/fftw/fftw-3.3.7 CACHE PATH "JSONCPP ROOT")
 
-
-#SET TO CORRECT PATH:
-SET(BOOST_ROOT  "F:/boost/boost_1_74_0"  CACHE PATH "BOOST_ROOT")
-SET(BOOST_LIBRARYDIR  "F:/boost/boost_1_74_0/stageMSVC64VS2019/lib" CACHE PATH "BOOST_LIBRARYDIR")
+SET(CMAKE_CUDA_ARCHITECTURES 52)
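+# 52 targets the Maxwell generation (e.g. GTX 9xx / Quadro M class cards); which GPU
+# actually sits in TESLA03 is an assumption here, so adjust the value to match the card.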
 
 SET(VTK_DIR "F:/Libraries/vtk/VTK-8.2.0/build" CACHE PATH "VTK directory override" FORCE)
 
diff --git a/apps/gpu/HULC/main.cpp b/apps/gpu/HULC/main.cpp
index 8a0e6f6a8363e579ad0821d8b8f9995e11eabc3e..35d70d1f49b344e0e5530c46582940ec581e3c7f 100644
--- a/apps/gpu/HULC/main.cpp
+++ b/apps/gpu/HULC/main.cpp
@@ -335,7 +335,7 @@ void multipleLevel(const std::string& configPath)
     //gridBuilder->writeGridToVTK("D:/GRIDGENERATION/gridTest_level_2", 2);
 
     SPtr<Parameter> para = Parameter::make();
-    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para);
+    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, communicator);
     //SPtr<GridProvider> gridGenerator = GridProvider::makeGridReader(false, para);
 
     std::ifstream stream;
diff --git a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
index 6be64950710c53b3c7931180a9beb1368a615fe3..785ef2b69609774d3e0d8eafd12fb29683715afb 100644
--- a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
+++ b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
@@ -179,7 +179,7 @@ void multipleLevel(const std::string& configPath)
 
     SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
 
-    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
     real turbPos[3] = {3*reference_diameter, 3*reference_diameter, 3*reference_diameter};
     real epsilon = 5.f; // width of gaussian smearing
diff --git a/apps/gpu/LBM/Basel/main.cpp b/apps/gpu/LBM/Basel/main.cpp
index 8d09c8a7a6996eecb5be84be54fd72f8ca1b014a..949b986ab28619deb3b5a9cf51ba6cf642921b1b 100644
--- a/apps/gpu/LBM/Basel/main.cpp
+++ b/apps/gpu/LBM/Basel/main.cpp
@@ -181,7 +181,7 @@ void multipleLevel(const std::string& configPath)
 
 		return;
 
-		gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemManager);
+		gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemManager, communicator);
 		//gridGenerator = GridGenerator::make(gridBuilder, para);
 
 	}
diff --git a/apps/gpu/LBM/BaselMultiGPU/main.cpp b/apps/gpu/LBM/BaselMultiGPU/main.cpp
index 70bd37403a52ce68cfa23adf23ea3f7fcbe2805e..8134fffbacf337db3c8e8668146ae81c4ab2128d 100644
--- a/apps/gpu/LBM/BaselMultiGPU/main.cpp
+++ b/apps/gpu/LBM/BaselMultiGPU/main.cpp
@@ -197,7 +197,7 @@ void multipleLevel(const std::string& configPath)
 
 		return;
 
-		gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemManager);
+		gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemManager, communicator);
 		//gridGenerator = GridGenerator::make(gridBuilder, para);
 
     }
diff --git a/apps/gpu/LBM/BaselNU/main.cpp b/apps/gpu/LBM/BaselNU/main.cpp
index 0ad52a4bb1398f08fccca8d7b4dfd3736614b8db..cdf61e80f597c8a55b13fe073419ca52b434d44a 100644
--- a/apps/gpu/LBM/BaselNU/main.cpp
+++ b/apps/gpu/LBM/BaselNU/main.cpp
@@ -170,7 +170,7 @@ void multipleLevel(const std::string& configPath)
 
 		return;
 
-		gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemManager);
+		gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemManager, communicator);
 		//gridGenerator = GridGenerator::make(gridBuilder, para);
 
 	}
diff --git a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
index c440bd14cf46ca8dae8013b5c0a480109924f7c4..eeb11dd6a2cce7e10c56a0348ec3624ba33ec8ba 100644
--- a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
+++ b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
@@ -211,7 +211,7 @@ void multipleLevel(const std::string& configPath)
 
     SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
 
-    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
     SPtr<PlanarAverageProbe> planarAverageProbe = SPtr<PlanarAverageProbe>( new PlanarAverageProbe("planeProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt , tStartOutProbe/dt, tOutProbe/dt, 'z') );
     planarAverageProbe->addAllAvailableStatistics();
diff --git a/apps/gpu/LBM/DrivenCavity/DrivenCavity.cpp b/apps/gpu/LBM/DrivenCavity/DrivenCavity.cpp
index 9f31c9358620331dd45dd1ff1a32f29b677a6d13..b6de85daa9a88c8e92869df425fa3ee56233fbe2 100644
--- a/apps/gpu/LBM/DrivenCavity/DrivenCavity.cpp
+++ b/apps/gpu/LBM/DrivenCavity/DrivenCavity.cpp
@@ -220,7 +220,7 @@ void multipleLevel(const std::string& configPath)
 
         SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
 
-        SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+        SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
         Simulation sim(communicator);
         SPtr<FileWriter> fileWriter = SPtr<FileWriter>(new FileWriter());
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/CMakeLists.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..51b8db1edf126ebe7e2f3d5808496121270433c5
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/CMakeLists.txt
@@ -0,0 +1,8 @@
+PROJECT(DrivenCavityMultiGPU LANGUAGES CUDA CXX)
+
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES DrivenCavityMultiGPU.cpp)
+
+set_source_files_properties(DrivenCavityMultiGPU.cpp PROPERTIES LANGUAGE CUDA)
+
+set_target_properties(DrivenCavityMultiGPU PROPERTIES 
+	CUDA_SEPARABLE_COMPILATION ON)
\ No newline at end of file
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp b/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b22a3a9e62c7cf2d59c1403a0bfd9edc42b66b8
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
@@ -0,0 +1,606 @@
+
+#define _USE_MATH_DEFINES
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include "mpi.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "basics/Core/DataTypes.h"
+#include "basics/Core/VectorTypes.h"
+#include "basics/PointerDefinitions.h"
+
+#include "basics/Core/LbmOrGks.h"
+#include "basics/Core/Logger/Logger.h"
+#include "basics/Core/StringUtilities/StringUtil.h"
+#include "basics/config/ConfigurationFile.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/GridFactory.h"
+
+#include "geometries/Cuboid/Cuboid.h"
+#include "geometries/TriangularMesh/TriangularMesh.h"
+
+#include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
+#include "GridGenerator/io/STLReaderWriter/STLReader.h"
+#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "VirtualFluids_GPU/Communication/Communicator.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
+#include "VirtualFluids_GPU/LBM/Simulation.h"
+#include "VirtualFluids_GPU/Output/FileWriter.h"
+#include "VirtualFluids_GPU/Parameter/Parameter.h"
+
+#include "VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.h"
+#include "VirtualFluids_GPU/PreProcessor/PreProcessorFactory/PreProcessorFactoryImp.h"
+
+#include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "utilities/communication.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//          U s e r    s e t t i n g s
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//  Tesla 03
+// std::string outPath("E:/temp/DrivenCavityMultiGPUResults/");
+// std::string gridPath = "D:/STLs/DrivenCavity";
+// std::string simulationName("DrivenCavityMultiGPU");
+
+// Phoenix
+// std::string outPath("/work/y0078217/Results/DrivenCavityMultiGPUResults/");
+// std::string gridPath = "/work/y0078217/Grids/GridDrivenCavityMultiGPU/";
+// std::string simulationName("DrivenCavityMultiGPU");
+
+//  Aragorn
+std::string outPath("/workspaces/VirtualFluids_dev/output/DrivenCavity_Results/");
+std::string gridPath = "/workspaces/VirtualFluids_dev/output/DrivenCavity_Results/grid/";
+std::string simulationName("DrivenCavity");
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void multipleLevel(const std::string &configPath)
+{
+    logging::Logger::addStream(&std::cout);
+    logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
+    logging::Logger::timeStamp(logging::Logger::ENABLE);
+    logging::Logger::enablePrintedRankNumbers(logging::Logger::ENABLE);
+
+    auto gridFactory = GridFactory::make();
+    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
+    auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+
+    vf::gpu::Communicator &communicator = vf::gpu::Communicator::getInstance();
+    vf::basics::ConfigurationFile config;
+    std::cout << configPath << std::endl;
+    config.load(configPath);
+    SPtr<Parameter> para = std::make_shared<Parameter>(config, communicator.getNummberOfProcess(), communicator.getPID());
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    bool useGridGenerator = true;
+    bool useLevels        = true;
+    // para->setUseStreams(useStreams);                  // set in config
+    // para->useReducedCommunicationAfterFtoC = true;    // set in config
+    para->setCalcTurbulenceIntensity(false);
+
+    if (para->getNumprocs() == 1) {
+        para->useReducedCommunicationAfterFtoC = false;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    const real L        = 1.0;
+    const real Re       = 1000.0; // 1000
+    const real velocity = 1.0;
+    const real dt       = (real)1.0e-3; // 0.5e-3;
+    const uint nx       = 64;
+    std::string simulationName("DrivenCavityChimMultiGPU");
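+    // Note: this local simulationName shadows the file-scope simulationName above,
+    // so the output prefix set further down is "DrivenCavityChimMultiGPU".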
+
+    // para->setTOut(10000);   // set in config
+    // para->setTEnd(10000);   // set in config
+
+    const real dxGrid      = L / real(nx);
+    const real velocityLB  = velocity * dt / dxGrid;       // LB units
+    const real vxLB        = velocityLB / (real)sqrt(2.0); // LB units
+    const real vyLB        = velocityLB / (real)sqrt(2.0); // LB units
+    const real viscosityLB = nx * velocityLB / Re;         // LB units
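+    // Worked example of the unit conversion above (a sketch, using the values as set):
+    //   dxGrid      = L/nx               = 1/64          = 0.015625
+    //   velocityLB  = velocity*dt/dx     = 1e-3/0.015625 = 0.064     [dx/dt]
+    //   viscosityLB = nx*velocityLB/Re   = 64*0.064/1000 = 4.096e-3  [dx^2/dt]
+    // The lid velocity is split evenly between x and y, hence the 1/sqrt(2) factors above.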
+
+    *logging::out << logging::Logger::INFO_HIGH << "velocity  [dx/dt] = " << velocityLB << " \n";
+    *logging::out << logging::Logger::INFO_HIGH << "viscosity [dx^2/dt] = " << viscosityLB << "\n";
+
+    para->setVelocity(velocityLB);
+    para->setViscosity(viscosityLB);
+    para->setVelocityRatio(velocity / velocityLB);
+    para->setDensityRatio((real)1.0); // correct value?
+
+    para->setInitialCondition([&](real coordX, real coordY, real coordZ, real &rho, real &vx, real &vy, real &vz) {
+        rho = (real)1.0;
+        vx  = (real)(coordX * velocityLB);
+        vy  = (real)(coordY * velocityLB);
+        vz  = (real)(coordZ * velocityLB);
+    });
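+    // Initial condition: unit density plus a velocity that grows linearly with each
+    // coordinate; a smooth, small-magnitude start from which the moving lid drives the flow.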
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    para->setCalcDragLift(false);
+    para->setUseWale(false);
+
+    if (para->getOutputPath().size() == 0) {
+        para->setOutputPath(outPath);
+    }
+    para->setOutputPrefix(simulationName);
+    para->setFName(para->getOutputPath() + para->getOutputPrefix());
+    para->setPrintFiles(true);
+    std::cout << "Write result files to " << para->getFName() << std::endl;
+
+    if (useLevels)
+        para->setMaxLevel(2);
+    else
+        para->setMaxLevel(1);
+
+    // para->setMainKernel("CumulantK17CompChim");
+    para->setMainKernel("CumulantK17CompChimStream");
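+    // Assumption from the naming: the "...Stream" kernel variant is the one that supports
+    // the CUDA-stream based communication hiding toggled via useStreams in the config.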
+    *logging::out << logging::Logger::INFO_HIGH << "Kernel: " << para->getMainKernel() << "\n";
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    if (useGridGenerator) {
+        const real xGridMin = -0.5 * L;
+        const real xGridMax = 0.5 * L;
+        const real yGridMin = -0.5 * L;
+        const real yGridMax = 0.5 * L;
+        const real zGridMin = -0.5 * L;
+        const real zGridMax = 0.5 * L;
+
+        Cuboid *level1 = nullptr;
+        if (useLevels)
+            level1 = new Cuboid(-0.25 * L, -0.25 * L, -0.25 * L, 0.25 * L, 0.25 * L, 0.25 * L);
+
+        if (para->getNumprocs() > 1) {
+
+            const uint generatePart = vf::gpu::Communicator::getInstance().getPID();
+            real overlap            = (real)8.0 * dxGrid;
+            gridBuilder->setNumberOfLayers(10, 8);
+
+            const real xSplit = 0.0;
+            const real ySplit = 0.0;
+            const real zSplit = 0.0;
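+            // Decomposition sketch (read off the split planes used below):
+            //   2 processes: two halves along z (ranks 0|1 at zSplit)
+            //   4 processes: four quarters in the x-z plane (xSplit, zSplit)
+            //   8 processes: eight octants (xSplit, ySplit, zSplit)
+            // Every coarse grid is extended by the 8-cell overlap so that neighboring
+            // subdomains share a layer of valid nodes for the halo exchange.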
+
+            if (communicator.getNummberOfProcess() == 2) {
+
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xGridMax, yGridMax, zSplit + overlap,
+                                               dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xGridMax, yGridMax, zGridMax,
+                                               dxGrid);
+                }
+
+                if (useLevels) {
+                    gridBuilder->addGrid(level1, 1);
+                }
+
+                if (generatePart == 0) {
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, yGridMin, yGridMax, zGridMin, zSplit));
+                }
+                if (generatePart == 1) {
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, yGridMin, yGridMax, zSplit, zGridMax));
+                }
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 1);
+                }
+
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
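+                // The pairing is symmetric: rank 0 communicates across its positive-z
+                // face (PZ) with rank 1, which answers across its negative-z face (MZ).
+                // findCommunicationIndices gathers the nodes on that face and
+                // setCommunicationProcess assigns the peer rank (reading of the API from its use here).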
+
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+                //////////////////////////////////////////////////////////////////////////
+                if (generatePart == 0)
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+                if (generatePart == 1)
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
+                //////////////////////////////////////////////////////////////////////////
+            } else if (communicator.getNummberOfProcess() == 4) {
+
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xSplit + overlap, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zGridMin, xGridMax, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xSplit + overlap, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zSplit - overlap, xGridMax, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+
+                if (useLevels) {
+                    gridBuilder->addGrid(level1, 1);
+                }
+
+                if (generatePart == 0)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, yGridMax, zGridMin, zSplit));
+                if (generatePart == 1)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, yGridMax, zGridMin, zSplit));
+                if (generatePart == 2)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, yGridMax, zSplit, zGridMax));
+                if (generatePart == 3)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, yGridMax, zSplit, zGridMax));
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 2);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 3);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 1);
+                }
+
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+                //////////////////////////////////////////////////////////////////////////
+                if (generatePart == 0) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
+                if (generatePart == 3) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
+                }
+                //////////////////////////////////////////////////////////////////////////
+            } else if (communicator.getNummberOfProcess() == 8) {
+
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xSplit + overlap, ySplit + overlap,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xGridMin, ySplit - overlap, zGridMin, xSplit + overlap, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zGridMin, xGridMax, ySplit + overlap,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, ySplit - overlap, zGridMin, xGridMax, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 4) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xSplit + overlap, ySplit + overlap,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->addCoarseGrid(xGridMin, ySplit - overlap, zSplit - overlap, xSplit + overlap, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zSplit - overlap, xGridMax, ySplit + overlap,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 7) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, ySplit - overlap, zSplit - overlap, xGridMax, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+
+                if (useLevels) {
+                    gridBuilder->addGrid(level1, 1);
+                }
+
+                if (generatePart == 0)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, ySplit, zGridMin, zSplit));
+                if (generatePart == 1)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, ySplit, yGridMax, zGridMin, zSplit));
+                if (generatePart == 2)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, ySplit, zGridMin, zSplit));
+                if (generatePart == 3)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, ySplit, yGridMax, zGridMin, zSplit));
+                if (generatePart == 4)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, ySplit, zSplit, zGridMax));
+                if (generatePart == 5)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, ySplit, yGridMax, zSplit, zGridMax));
+                if (generatePart == 6)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, ySplit, zSplit, zGridMax));
+                if (generatePart == 7)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, ySplit, yGridMax, zSplit, zGridMax));
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 4);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 5);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 6);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 7);
+                }
+                if (generatePart == 4) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 5);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 6);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 4);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 7);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 1);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 7);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 4);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 2);
+                }
+                if (generatePart == 7) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 6);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 5);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 3);
+                }
+
+                //////////////////////////////////////////////////////////////////////////
+                if (generatePart == 0) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
+                }
+                if (generatePart == 4) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 7) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                //////////////////////////////////////////////////////////////////////////
+            }
+            if (para->getKernelNeedsFluidNodeIndicesToRun())
+                gridBuilder->findFluidNodes(para->getUseStreams());
+
+            // gridBuilder->writeGridsToVtk(outPath +  "/grid/part" + std::to_string(generatePart) + "_");
+            // gridBuilder->writeArrows(outPath + "/" + std::to_string(generatePart) + " /arrow");
+
+            SimulationFileWriter::write(gridPath + std::to_string(generatePart) + "/", gridBuilder, FILEFORMAT::BINARY);
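+            // Each rank writes its grid part to its own subdirectory <gridPath>/<rank>/,
+            // presumably so a later GridReader run can pick up the matching subdomain files.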
+        } else {
+
+            gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xGridMax, yGridMax, zGridMax, dxGrid);
+
+            if (useLevels) {
+                gridBuilder->setNumberOfLayers(10, 8);
+                gridBuilder->addGrid(level1, 1);
+            }
+
+            gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+            gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+            //////////////////////////////////////////////////////////////////////////
+            gridBuilder->setVelocityBoundaryCondition(SideType::MX, 0.0, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::PX, 0.0, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::PY, 0.0, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+
+            //////////////////////////////////////////////////////////////////////////
+            if (para->getKernelNeedsFluidNodeIndicesToRun())
+                gridBuilder->findFluidNodes(para->getUseStreams());
+
+            gridBuilder->writeGridsToVtk(outPath + "/grid/");
+            // gridBuilder->writeArrows(outPath + "/arrow");
+
+            SimulationFileWriter::write(gridPath, gridBuilder, FILEFORMAT::BINARY);
+        }
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
+
+    SPtr<GridProvider> gridGenerator;
+    if (useGridGenerator)
+        gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
+    else {
+        gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
+    }
+
+    Simulation sim(communicator);
+    SPtr<FileWriter> fileWriter                      = SPtr<FileWriter>(new FileWriter());
+    SPtr<KernelFactoryImp> kernelFactory             = KernelFactoryImp::getInstance();
+    SPtr<PreProcessorFactoryImp> preProcessorFactory = PreProcessorFactoryImp::getInstance();
+    sim.setFactories(kernelFactory, preProcessorFactory);
+    sim.init(para, gridGenerator, fileWriter, cudaMemoryManager);
+    sim.run();
+    sim.free();
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+}
+
+int main(int argc, char *argv[])
+{
+    MPI_Init(&argc, &argv);
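+    // Typical launch, with placeholder paths (the process count has to match one of the
+    // decompositions handled in multipleLevel(), i.e. 1, 2, 4 or 8):
+    //   mpirun -np 2 ./DrivenCavityMultiGPU configDrivenCavityMultiGPU.txt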
+    std::string str, str2, configFile;
+
+    if (argv != NULL) {
+
+        try {
+            //////////////////////////////////////////////////////////////////////////
+
+            std::string targetPath;
+
+            targetPath = __FILE__;
+
+            if (argc == 2) {
+                configFile = argv[1];
+                std::cout << "Using configFile command line argument: " << configFile << std::endl;
+            }
+
+#ifdef _WIN32
+            targetPath = targetPath.substr(0, targetPath.find_last_of('\\') + 1);
+#else
+            targetPath = targetPath.substr(0, targetPath.find_last_of('/') + 1);
+#endif
+
+            std::cout << targetPath << std::endl;
+
+            if (configFile.size() == 0) {
+                configFile = targetPath + "configDrivenCavityMultiGPU.txt";
+            }
+
+            multipleLevel(configFile);
+
+            //////////////////////////////////////////////////////////////////////////
+        } catch (const std::bad_alloc &e) {
+            *logging::out << logging::Logger::LOGGER_ERROR << "Bad Alloc:" << e.what() << "\n";
+        } catch (const std::exception &e) {
+            *logging::out << logging::Logger::LOGGER_ERROR << e.what() << "\n";
+        } catch (...) {
+            *logging::out << logging::Logger::LOGGER_ERROR << "Unknown exception!\n";
+        }
+    }
+
+    MPI_Finalize();
+    return 0;
+}
\ No newline at end of file
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configDrivenCavityMultiGPU.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configDrivenCavityMultiGPU.txt
new file mode 100644
index 0000000000000000000000000000000000000000..97f5b5d8079e6b937863cca09e6076e0a753af23
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/configDrivenCavityMultiGPU.txt
@@ -0,0 +1,36 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0"
+NumberOfDevices=1
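+# Assumption from the configs in this folder: "Devices" lists the visible CUDA device
+# IDs and NumberOfDevices matches the number of entries in that list.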
+
+##################################################
+#information for writing
+##################################################
+Path=/workspaces/VirtualFluids_dev/output/DrivenCavity_Results/  # Aragorn
+#Prefix="DrivenCavity"
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/workspaces/VirtualFluids_dev/output/DrivenCavity_Results/grid/  # Aragorn
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameters
+##################################################
+TimeEnd=1000
+TimeOut=100
+#TimeStartOut=0
\ No newline at end of file
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix1GPU.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix1GPU.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e63db13b533d24ef44b2e4d472ffba481d79f828
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix1GPU.txt
@@ -0,0 +1,35 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0"
+NumberOfDevices=1
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/DrivenCavityMultiGPUResults/1GPU/
+#Prefix="DrivenCavityMultiGPU" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridDrivenCavityMultiGPU/1GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameters
+##################################################
+TimeEnd=1000
+TimeOut=1000
+#TimeStartOut=0
\ No newline at end of file
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix2GPU.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix2GPU.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30b1882df34dcae63674b42ce6d65a47942ff87e
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix2GPU.txt
@@ -0,0 +1,41 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1"
+NumberOfDevices=2
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/DrivenCavityMultiGPUResults/2GPU/
+#Prefix="DrivenCavityMultiGPU" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridDrivenCavityMultiGPU/2GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameters
+##################################################
+TimeEnd=1
+TimeOut=1
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c710922b9fc82ac7680f5f7daade4faa235bc957
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt
@@ -0,0 +1,41 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/DrivenCavityMultiGPUResults/4GPU/
+#Prefix="DrivenCavityMultiGPU" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridDrivenCavityMultiGPU/4GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameters
+##################################################
+TimeEnd=1
+TimeOut=1
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix8GPU.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix8GPU.txt
new file mode 100644
index 0000000000000000000000000000000000000000..85684e7589dad91e53356c16bc2eae44081d7e96
--- /dev/null
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix8GPU.txt
@@ -0,0 +1,41 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/DrivenCavityMultiGPUResults/8GPU/
+#Prefix="DrivenCavityMultiGPU" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridDrivenCavityMultiGPU/8GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameters
+##################################################
+TimeEnd=1000
+TimeOut=1000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/MusselOyster/CMakeLists.txt b/apps/gpu/LBM/MusselOyster/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..595d9ff7250d984f80e8d0d54dad0b11ae7e71e2
--- /dev/null
+++ b/apps/gpu/LBM/MusselOyster/CMakeLists.txt
@@ -0,0 +1,8 @@
+PROJECT(MusselOyster LANGUAGES CUDA CXX)
+
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES MusselOyster.cpp)
+
+set_source_files_properties(MusselOyster.cpp PROPERTIES LANGUAGE CUDA)
+
+set_target_properties(MusselOyster PROPERTIES 
+	CUDA_SEPARABLE_COMPILATION ON)
\ No newline at end of file
diff --git a/apps/gpu/LBM/MusselOyster/MusselOyster.cpp b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e43472e8f3c817f10c12266b88948798a2bb676
--- /dev/null
+++ b/apps/gpu/LBM/MusselOyster/MusselOyster.cpp
@@ -0,0 +1,638 @@
+#define _USE_MATH_DEFINES
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include "mpi.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "basics/Core/DataTypes.h"
+#include "basics/Core/VectorTypes.h"
+#include "basics/PointerDefinitions.h"
+
+#include "basics/Core/LbmOrGks.h"
+#include "basics/Core/Logger/Logger.h"
+#include "basics/Core/StringUtilities/StringUtil.h"
+#include "basics/config/ConfigurationFile.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/GridFactory.h"
+
+#include "geometries/Sphere/Sphere.h"
+#include "geometries/TriangularMesh/TriangularMesh.h"
+
+#include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
+#include "GridGenerator/io/STLReaderWriter/STLReader.h"
+#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "VirtualFluids_GPU/Communication/Communicator.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
+#include "VirtualFluids_GPU/LBM/Simulation.h"
+#include "VirtualFluids_GPU/Output/FileWriter.h"
+#include "VirtualFluids_GPU/Parameter/Parameter.h"
+
+#include "VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.h"
+#include "VirtualFluids_GPU/PreProcessor/PreProcessorFactory/PreProcessorFactoryImp.h"
+
+#include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "utilities/communication.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//          U s e r    s e t t i n g s
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Tesla 03
+// std::string outPath("E:/temp/MusselOysterResults/");
+// std::string gridPathParent = "E:/temp/GridMussel/";
+// std::string stlPath("C:/Users/Master/Documents/MasterAnna/STL/");
+// std::string simulationName("MusselOyster");
+
+// Aragorn
+// std::string outPath("/workspaces/VirtualFluids_dev/output/MusselOysterResults/");
+// std::string gridPathParent = "/workspaces/VirtualFluids_dev/output/MusselOysterResults/grid/";
+// std::string stlPath("/workspaces/VirtualFluids_dev/stl/MusselOyster/");
+// std::string simulationName("MusselOyster");
+
+// Phoenix
+std::string outPath("/work/y0078217/Results/MusselOysterResults/");
+std::string gridPathParent = "/work/y0078217/Grids/GridMusselOyster/";
+std::string stlPath("/home/y0078217/STL/MusselOyster/");
+std::string simulationName("MusselOyster");
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void multipleLevel(const std::string &configPath)
+{
+    logging::Logger::addStream(&std::cout);
+    logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
+    logging::Logger::timeStamp(logging::Logger::ENABLE);
+    logging::Logger::enablePrintedRankNumbers(logging::Logger::ENABLE);
+
+    auto gridFactory = GridFactory::make();
+    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
+    auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+
+    vf::gpu::Communicator &communicator = vf::gpu::Communicator::getInstance();
+    vf::basics::ConfigurationFile config;
+    std::cout << configPath << std::endl;
+    config.load(configPath);
+    SPtr<Parameter> para = std::make_shared<Parameter>(config, communicator.getNummberOfProcess(), communicator.getPID());
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    bool useGridGenerator                  = true;
+    bool useStreams                        = true;
+    bool useLevels                         = true;
+    para->useReducedCommunicationAfterFtoC = true;
+    para->setCalcTurbulenceIntensity(true);
+
+    if (para->getNumprocs() == 1) {
+        para->useReducedCommunicationAfterFtoC = false;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    std::string bivalveType = "MUSSEL"; // "MUSSEL" "OYSTER"
+    std::string gridPath(
+        gridPathParent +
+        bivalveType); // only for GridGenerator, for GridReader the gridPath needs to be set in the config file
+
+    // real dxGrid = (real)2.0; // 2.0
+    real dxGrid = (real)1.0; // 1.0
+    if (para->getNumprocs() == 8)
+        dxGrid = 0.5;
+    real vxLB            = (real)0.051; // LB units
+    real Re              = (real)300.0;
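+    // choose viscosityLB so that Re = vxLB * referenceLength / viscosityLB holds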
+    real referenceLength = 1.0 / dxGrid; // heightBivalve / dxGrid
+    real viscosityLB     = (vxLB * referenceLength) / Re;
+
+    para->setVelocity(vxLB);
+    para->setViscosity(viscosityLB);
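+    // conversion factors from LB units to SI units (used for the "real" values logged below)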
+    para->setVelocityRatio((real)58.82352941);
+    para->setViscosityRatio((real)0.058823529);
+    para->setDensityRatio((real)998.0);
+
+    *logging::out << logging::Logger::INFO_HIGH << "bivalveType = " << bivalveType << " \n";
+    *logging::out << logging::Logger::INFO_HIGH << "velocity LB [dx/dt] = " << vxLB << " \n";
+    *logging::out << logging::Logger::INFO_HIGH << "viscosity LB [dx^2/dt] = " << viscosityLB << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "velocity real [m/s] = " << vxLB * para->getVelocityRatio() << " \n";
+    *logging::out << logging::Logger::INFO_HIGH
+                  << "viscosity real [m^2/s] = " << viscosityLB * para->getViscosityRatio() << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "dxGrid = " << dxGrid << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "useGridGenerator = " << useGridGenerator << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "useStreams = " << useStreams << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "number of processes = " << para->getNumprocs() << "\n";
+
+    // para->setTOut(1000);
+    // para->setTEnd(10000);
+
+    para->setCalcDragLift(false);
+    para->setUseWale(false);
+
+    if (para->getOutputPath().size() == 0) {
+        para->setOutputPath(outPath);
+    }
+    para->setOutputPrefix(simulationName);
+    para->setFName(para->getOutputPath() + para->getOutputPrefix());
+    para->setPrintFiles(true);
+    std::cout << "Write result files to " << para->getFName() << std::endl;
+
+    if (useLevels)
+        para->setMaxLevel(2);
+    else
+        para->setMaxLevel(1);
+
+    para->setUseStreams(useStreams);
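+    // choose the stream-enabled kernel variant to match the CUDA stream setting above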
+    // para->setMainKernel("CumulantK17CompChim");
+    para->setMainKernel("CumulantK17CompChimStream");
+    *logging::out << logging::Logger::INFO_HIGH << "Kernel: " << para->getMainKernel() << "\n";
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if (useGridGenerator) {
+        const real xGridMin = -100.0; // -100.0;
+        const real xGridMax = 470.0;  // old 540.0 // new 440 // with larger level-1 grid: 470
+        const real yGridMin = 1.0;    // 1.0;
+        const real yGridMax = 350.0;  // old 440.0 // new 350
+        const real zGridMin = -85;    // -85;
+        const real zGridMax = 85.0;   // 85;
+
+        // height MUSSEL = 35.0
+        // height OYSTER = 72.0
+
+        TriangularMesh *bivalveSTL       = TriangularMesh::make(stlPath + bivalveType + ".stl");
+        TriangularMesh *bivalveRef_1_STL = nullptr;
+        if (useLevels)
+            bivalveRef_1_STL = TriangularMesh::make(stlPath + bivalveType + "_Level1.stl");
+
+        if (para->getNumprocs() > 1) {
+            const uint generatePart = vf::gpu::Communicator::getInstance().getPID();
+
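+            // extend each subdomain by an eight-cell overlap so neighboring processes share a halo for the exchange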
+            real overlap = (real)8.0 * dxGrid;
+            gridBuilder->setNumberOfLayers(10, 8);
+
+            if (communicator.getNummberOfProcess() == 2) {
+                const real zSplit = 0.0; // round(((double)bbzp + bbzm) * 0.5);
+
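+                // each rank builds only its own half of the domain, reaching across the split plane by the overlap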
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xGridMax, yGridMax, zSplit + overlap,
+                                               dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xGridMax, yGridMax, zGridMax,
+                                               dxGrid);
+                }
+
+                if (useLevels) {
+                    gridBuilder->addGrid(bivalveRef_1_STL, 1);
+                }
+
+                gridBuilder->addGeometry(bivalveSTL);
+
+                if (generatePart == 0) {
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, yGridMin, yGridMax, zGridMin, zSplit));
+                }
+                if (generatePart == 1) {
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, yGridMin, yGridMax, zSplit, zGridMax));
+                }
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 1);
+                }
+
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
+
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+                //////////////////////////////////////////////////////////////////////////
+                if (generatePart == 0)
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                if (generatePart == 1)
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
+                gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                //////////////////////////////////////////////////////////////////////////
+            } else if (communicator.getNummberOfProcess() == 4) {
+
+                const real xSplit = 100.0;
+                const real zSplit = 0.0;
+
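+                // 2x2 decomposition: the domain is split in x and z, while y stays undivided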
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xSplit + overlap, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zGridMin, xGridMax, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xSplit + overlap, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zSplit - overlap, xGridMax, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+
+                if (useLevels) {
+                    gridBuilder->addGrid(bivalveRef_1_STL, 1);
+                }
+
+                gridBuilder->addGeometry(bivalveSTL);
+
+                if (generatePart == 0)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, yGridMax, zGridMin, zSplit));
+                if (generatePart == 1)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, yGridMax, zGridMin, zSplit));
+                if (generatePart == 2)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, yGridMax, zSplit, zGridMax));
+                if (generatePart == 3)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, yGridMax, zSplit, zGridMax));
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 2);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 3);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 1);
+                }
+
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+                //////////////////////////////////////////////////////////////////////////
+                if (generatePart == 0) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                if (generatePart == 3) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                }
+                if (generatePart == 1) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                }
+                //////////////////////////////////////////////////////////////////////////
+            } else if (communicator.getNummberOfProcess() == 8) {
+                real xSplit = 140.0; // 100.0 // with larger level-1 grid: 140.0
+                real ySplit = 32.0;  // 32.0
+                real zSplit = 0.0;
+
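+                // 2x2x2 decomposition: ranks 0-3 lie below zSplit, ranks 4-7 above it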
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xSplit + overlap, ySplit + overlap,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xGridMin, ySplit - overlap, zGridMin, xSplit + overlap, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zGridMin, xGridMax, ySplit + overlap,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, ySplit - overlap, zGridMin, xGridMax, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 4) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xSplit + overlap, ySplit + overlap,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->addCoarseGrid(xGridMin, ySplit - overlap, zSplit - overlap, xSplit + overlap, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zSplit - overlap, xGridMax, ySplit + overlap,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 7) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, ySplit - overlap, zSplit - overlap, xGridMax, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+
+                if (useLevels) {
+                    gridBuilder->addGrid(bivalveRef_1_STL, 1);
+                }
+
+                gridBuilder->addGeometry(bivalveSTL);
+
+                if (generatePart == 0)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, ySplit, zGridMin, zSplit));
+                if (generatePart == 1)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, ySplit, yGridMax, zGridMin, zSplit));
+                if (generatePart == 2)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, ySplit, zGridMin, zSplit));
+                if (generatePart == 3)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, ySplit, yGridMax, zGridMin, zSplit));
+                if (generatePart == 4)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, ySplit, zSplit, zGridMax));
+                if (generatePart == 5)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, ySplit, yGridMax, zSplit, zGridMax));
+                if (generatePart == 6)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, ySplit, zSplit, zGridMax));
+                if (generatePart == 7)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, ySplit, yGridMax, zSplit, zGridMax));
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+
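+                // connect each rank to its three face neighbors in the 2x2x2 decomposition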
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 4);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 5);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 6);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 7);
+                }
+                if (generatePart == 4) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 5);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 6);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 4);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 7);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 1);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 7);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 4);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 2);
+                }
+                if (generatePart == 7) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 6);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 5);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 3);
+                }
+
+                //////////////////////////////////////////////////////////////////////////
+                if (generatePart == 0) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                }
+                if (generatePart == 3) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                }
+                if (generatePart == 4) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                }
+                if (generatePart == 7) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                }
+                // gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
+                //////////////////////////////////////////////////////////////////////////
+            }
+            if (para->getKernelNeedsFluidNodeIndicesToRun())
+                gridBuilder->findFluidNodes(useStreams);
+
+            // gridBuilder->writeGridsToVtk(outPath + bivalveType + "/grid/part" + std::to_string(generatePart) + "_");
+            // gridBuilder->writeArrows(outPath + bivalveType + "/" + std::to_string(generatePart) + "/arrow");
+            // SimulationFileWriter::write(gridPath + std::to_string(generatePart) + "/", gridBuilder,
+            //                             FILEFORMAT::BINARY);
+        } else {
+
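+            // single-process case: build the complete domain on one GPU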
+            gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xGridMax, yGridMax, zGridMax, dxGrid);
+
+            if (useLevels) {
+                gridBuilder->setNumberOfLayers(10, 8);
+                gridBuilder->addGrid(bivalveRef_1_STL, 1);
+            }
+
+            gridBuilder->addGeometry(bivalveSTL);
+
+            gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+
+            gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+            //////////////////////////////////////////////////////////////////////////
+            gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::MY, 0.0, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
+            gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+
+            //////////////////////////////////////////////////////////////////////////
+            if (para->getKernelNeedsFluidNodeIndicesToRun())
+                gridBuilder->findFluidNodes(useStreams);
+
+            // gridBuilder->writeGridsToVtk(outPath + bivalveType + "/grid/");
+            // gridBuilder->writeArrows(outPath + bivalveType + "/arrow");
+
+            SimulationFileWriter::write(gridPath, gridBuilder, FILEFORMAT::BINARY);
+        }
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
+
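+    // either generate the grid in memory (useGridGenerator) or read a previously written grid from disk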
+    SPtr<GridProvider> gridGenerator;
+    if (useGridGenerator)
+        gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
+    else {
+        gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
+    }
+
+    Simulation sim(communicator);
+    SPtr<FileWriter> fileWriter                      = std::make_shared<FileWriter>();
+    SPtr<KernelFactoryImp> kernelFactory             = KernelFactoryImp::getInstance();
+    SPtr<PreProcessorFactoryImp> preProcessorFactory = PreProcessorFactoryImp::getInstance();
+    sim.setFactories(kernelFactory, preProcessorFactory);
+    sim.init(para, gridGenerator, fileWriter, cudaMemoryManager);
+    sim.run();
+    sim.free();
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+}
+
+int main(int argc, char *argv[])
+{
+    MPI_Init(&argc, &argv);
+    std::string configFile;
+
+    if (argv != NULL) {
+        try {
+            //////////////////////////////////////////////////////////////////////////
+
+            std::string targetPath;
+
+            targetPath = __FILE__;
+
+            if (argc == 2) {
+                configFile = argv[1];
+                std::cout << "Using configFile command line argument: " << configFile << std::endl;
+            }
+
+#ifdef _WIN32
+            targetPath = targetPath.substr(0, targetPath.find_last_of('\\') + 1);
+#else
+            targetPath = targetPath.substr(0, targetPath.find_last_of('/') + 1);
+#endif
+
+            std::cout << targetPath << std::endl;
+
+            if (configFile.size() == 0) {
+                configFile = targetPath + "configMusselOyster.txt";
+            }
+
+            multipleLevel(configFile);
+
+            //////////////////////////////////////////////////////////////////////////
+        } catch (const std::bad_alloc &e) {
+            *logging::out << logging::Logger::LOGGER_ERROR << "Bad Alloc: " << e.what() << "\n";
+        } catch (const std::exception &e) {
+            *logging::out << logging::Logger::LOGGER_ERROR << e.what() << "\n";
+        } catch (...) {
+            *logging::out << logging::Logger::LOGGER_ERROR << "Unknown exception!\n";
+        }
+    }
+
+    MPI_Finalize();
+    return 0;
+}
\ No newline at end of file
diff --git a/apps/gpu/LBM/MusselOyster/configMusselOyster.txt b/apps/gpu/LBM/MusselOyster/configMusselOyster.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0978e65f0500371a54af99e319ae6c53572d93c0
--- /dev/null
+++ b/apps/gpu/LBM/MusselOyster/configMusselOyster.txt
@@ -0,0 +1,50 @@
+# Tesla 03
+# mpiexec -n 2 "C:/Users/Master/Documents/MasterAnna/VirtualFluids_dev/build/bin/Release/MusselOyster.exe" "C:/Users/Master/Documents/MasterAnna/VirtualFluids_dev/apps/gpu/LBM/MusselOyster/configMusselOyster.txt"
+# Phoenix
+# mpirun -np 2 "./VirtualFluids_dev/build/bin/MusselOyster" "./VirtualFluids_dev/apps/gpu/LBM/MusselOyster/configMusselOyster.txt"
+
+# Phoenix mpich
+# mpirun -np 4 nvprof -f -o MusselOyster.%q{PMI_RANK}.nvprof "./VirtualFluids_dev/build/bin/MusselOyster" "./VirtualFluids_dev/apps/gpu/LBM/MusselOyster/configPhoenix4GPU.txt"
+# Phoenix openmpi
+# mpirun -np 4 nvprof -f -o MusselOyster.%q{OMPI_COMM_WORLD_RANK}.nvprof "./VirtualFluids_dev/build/bin/MusselOyster" "./VirtualFluids_dev/apps/gpu/LBM/MusselOyster/configPhoenix4GPU.txt"
+
+# Aragorn
+# ./bin/MusselOyster "../apps/gpu/LBM/MusselOyster/configMusselOyster.txt"
+
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=2
+
+##################################################
+#information for writing
+##################################################
+#Path=/work/y0078217/Results/MusselOysterResults/
+Path=/workspaces/VirtualFluids_dev/output/MusselOysterResults/
+#Prefix="MusselOyster" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+#GridPath=E:/work/y0078217/Grids/GridMusselOyster/
+GridPath=/workspaces/VirtualFluids_dev/output/MusselOysterResults/grid/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=10000
+TimeOut=10000 
+#TimeStartOut=0
\ No newline at end of file
diff --git a/apps/gpu/LBM/MusselOyster/configPhoenix1GPU.txt b/apps/gpu/LBM/MusselOyster/configPhoenix1GPU.txt
new file mode 100644
index 0000000000000000000000000000000000000000..369d68be327f72ef3762a2ddaf31ff89b84e06c7
--- /dev/null
+++ b/apps/gpu/LBM/MusselOyster/configPhoenix1GPU.txt
@@ -0,0 +1,36 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0"
+NumberOfDevices=1
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/MusselOysterResults/1GPUMussel1/
+#Path="F:/Work/Computations/out/MusselOyster/"
+#Prefix="MusselOyster" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridMusselOyster/Mussel1GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=10000 #400000 / 200000
+TimeOut=5000 #200000 / 100000
+#TimeStartOut=0
\ No newline at end of file
diff --git a/apps/gpu/LBM/MusselOyster/configPhoenix8GPU.txt b/apps/gpu/LBM/MusselOyster/configPhoenix8GPU.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e2b0c91482b6a650ff28a210673cac097cb8c2d
--- /dev/null
+++ b/apps/gpu/LBM/MusselOyster/configPhoenix8GPU.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/MusselOysterResults/8GPUOyster05/
+#Path="F:/Work/Computations/out/MusselOyster/"
+#Prefix="MusselOyster" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridMusselOyster/Oyster8GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=400000 # 800000
+TimeOut=100000 # 400000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams=true
+useReducedCommunicationInInterpolation=true
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/CMakeLists.txt b/apps/gpu/LBM/SphereScaling/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..db3747f2b620cab1efc5cf50f02aee1a8fee4a54
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/CMakeLists.txt
@@ -0,0 +1,8 @@
+PROJECT(SphereScaling LANGUAGES CUDA CXX)
+
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES SphereScaling.cpp)
+
+set_source_files_properties(SphereScaling.cpp PROPERTIES LANGUAGE CUDA)
+
+set_target_properties(SphereScaling PROPERTIES 
+	CUDA_SEPARABLE_COMPILATION ON)
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/SphereScaling.cpp b/apps/gpu/LBM/SphereScaling/SphereScaling.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7bd363310d3c3b3d4ab60559919b2253dfc1f3ba
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/SphereScaling.cpp
@@ -0,0 +1,739 @@
+
+#define _USE_MATH_DEFINES
+#include <exception>
+#include <fstream>
+#include <iostream>
+#include <math.h>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include "mpi.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "basics/Core/DataTypes.h"
+#include "basics/Core/VectorTypes.h"
+#include "basics/PointerDefinitions.h"
+
+#include "basics/Core/LbmOrGks.h"
+#include "basics/Core/Logger/Logger.h"
+#include "basics/Core/StringUtilities/StringUtil.h"
+#include "basics/config/ConfigurationFile.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/GridFactory.h"
+
+#include "geometries/Conglomerate/Conglomerate.h"
+#include "geometries/Cuboid/Cuboid.h"
+#include "geometries/Sphere/Sphere.h"
+#include "geometries/TriangularMesh/TriangularMesh.h"
+
+#include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
+#include "GridGenerator/io/STLReaderWriter/STLReader.h"
+#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+#include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "VirtualFluids_GPU/Communication/Communicator.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
+#include "VirtualFluids_GPU/LBM/Simulation.h"
+#include "VirtualFluids_GPU/Output/FileWriter.h"
+#include "VirtualFluids_GPU/Parameter/Parameter.h"
+
+#include "VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.h"
+#include "VirtualFluids_GPU/PreProcessor/PreProcessorFactory/PreProcessorFactoryImp.h"
+
+#include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "utilities/communication.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//          U s e r    s e t t i n g s
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Tesla 03
+// std::string outPath("E:/temp/SphereScalingResults/");
+// std::string gridPathParent = "E:/temp/GridSphereScaling/";
+// std::string simulationName("SphereScaling");
+// std::string stlPath("C:/Users/Master/Documents/MasterAnna/STL/Sphere/");
+
+// Phoenix
+std::string outPath("/work/y0078217/Results/SphereScalingResults/");
+std::string gridPathParent = "/work/y0078217/Grids/GridSphereScaling/";
+std::string simulationName("SphereScaling");
+std::string stlPath("/home/y0078217/STL/Sphere/");
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void multipleLevel(const std::string &configPath)
+{
+    logging::Logger::addStream(&std::cout);
+    logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
+    logging::Logger::timeStamp(logging::Logger::ENABLE);
+    logging::Logger::enablePrintedRankNumbers(logging::Logger::ENABLE);
+
+    auto gridFactory = GridFactory::make();
+    gridFactory->setTriangularMeshDiscretizationMethod(TriangularMeshDiscretizationMethod::POINT_IN_OBJECT);
+    auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+
+    vf::gpu::Communicator &communicator = vf::gpu::Communicator::getInstance();
+    vf::basics::ConfigurationFile config;
+    std::cout << configPath << std::endl;
+    config.load(configPath);
+    SPtr<Parameter> para = std::make_shared<Parameter>(config, communicator.getNummberOfProcess(), communicator.getPID());
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    bool useGridGenerator   = true;
+    bool useLevels          = true;
+    std::string scalingType = "strong"; // "strong" // "weak"
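+    // strong scaling keeps the total domain size fixed; weak scaling enlarges the domain with the process count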
+    // para->setUseStreams(true);                        // set in config
+    // para->useReducedCommunicationAfterFtoC = true;    // set in config
+
+    if (para->getNumprocs() == 1) {
+        para->useReducedCommunicationAfterFtoC = false;
+    }
+    if (scalingType != "weak" && scalingType != "strong")
+        std::cerr << "unknown scaling type" << std::endl;
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    std::string gridPath(
+        gridPathParent); // only for GridGenerator, for GridReader the gridPath needs to be set in the config file
+
+    real dxGrid      = (real)0.2;
+    real vxLB        = (real)0.0005; // LB units
+    real viscosityLB = 0.001;        //(vxLB * dxGrid) / Re;
+
+    para->setVelocity(vxLB);
+    para->setViscosity(viscosityLB);
+    para->setVelocityRatio((real)58.82352941);
+    para->setViscosityRatio((real)0.058823529);
+    para->setDensityRatio((real)998.0);
+
+    *logging::out << logging::Logger::INFO_HIGH << "velocity LB [dx/dt] = " << vxLB << " \n";
+    *logging::out << logging::Logger::INFO_HIGH << "viscosity LB [dx^2/dt] = " << viscosityLB << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "velocity real [m/s] = " << vxLB * para->getVelocityRatio() << " \n";
+    *logging::out << logging::Logger::INFO_HIGH
+                  << "viscosity real [m^2/s] = " << viscosityLB * para->getViscosityRatio() << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "dxGrid = " << dxGrid << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "useGridGenerator = " << useGridGenerator << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "useStreams = " << para->getUseStreams() << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "number of processes = " << para->getNumprocs() << "\n";
+    *logging::out << logging::Logger::INFO_HIGH
+                  << "para->useReducedCommunicationAfterFtoC = " << para->useReducedCommunicationAfterFtoC << "\n";
+    *logging::out << logging::Logger::INFO_HIGH << "scalingType = " << scalingType << "\n";
+
+    // para->setTOut(10);
+    // para->setTEnd(10);
+
+    para->setCalcDragLift(false);
+    para->setUseWale(false);
+
+    if (para->getOutputPath().size() == 0) {
+        para->setOutputPath(outPath);
+    }
+    para->setOutputPrefix(simulationName);
+    para->setFName(para->getOutputPath() + para->getOutputPrefix());
+    para->setPrintFiles(true);
+    std::cout << "Write result files to " << para->getFName() << std::endl;
+
+    if (useLevels)
+        para->setMaxLevel(2);
+    else
+        para->setMaxLevel(1);
+
+    // para->setMainKernel("CumulantK17CompChim");
+    para->setMainKernel("CumulantK17CompChimStream");
+    *logging::out << logging::Logger::INFO_HIGH << "Kernel: " << para->getMainKernel() << "\n";
+
+    // if (para->getNumprocs() == 4) {
+    //     para->setDevices(std::vector<uint>{ 0u, 1u, 2u, 3u });
+    //     para->setMaxDev(4);
+    // } else if (para->getNumprocs() == 2) {
+    //     para->setDevices(std::vector<uint>{ 2u, 3u });
+    //     para->setMaxDev(2);
+    // } else
+    //     para->setDevices(std::vector<uint>{ 0u });
+    //     para->setMaxDev(1);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if (useGridGenerator) {
+        real sideLengthCube;
+        if (useLevels) {
+            if (scalingType == "strong")
+                sideLengthCube = 76.0; // Phoenix: strong scaling with two levels = 76.0
+            else if (scalingType == "weak")
+                sideLengthCube = 70.0; // Phoenix: weak scaling with two levels = 70.0
+        } else
+            sideLengthCube = 92.0; // Phoenix: 86.0
+        real xGridMin          = 0.0;
+        real xGridMax          = sideLengthCube;
+        real yGridMin          = 0.0;
+        real yGridMax          = sideLengthCube;
+        real zGridMin          = 0.0;
+        real zGridMax          = sideLengthCube;
+        const real dSphere     = 10.0;
+        const real dSphereLev1 = 22.0; // Phoenix: 22.0
+        const real dCubeLev1   = 72.0; // Phoenix: 72.0
+
+        if (para->getNumprocs() > 1) {
+            const uint generatePart = vf::gpu::Communicator::getInstance().getPID();
+
+            real overlap = (real)8.0 * dxGrid;
+            gridBuilder->setNumberOfLayers(10, 8);
+
+            if (communicator.getNummberOfProcess() == 2) {
+                real zSplit = 0.5 * sideLengthCube;
+
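+                // weak scaling: append a second cube in z so each process keeps a full-size domain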
+                if (scalingType == "weak") {
+                    zSplit   = zGridMax;
+                    zGridMax = zGridMax + sideLengthCube;
+                }
+
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xGridMax, yGridMax, zSplit + overlap,
+                                               dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xGridMax, yGridMax, zGridMax,
+                                               dxGrid);
+                }
+
+                if (useLevels) {
+                    if (scalingType == "strong") {
+                        gridBuilder->addGrid(
+                            new Sphere(0.5 * sideLengthCube, 0.5 * sideLengthCube, 0.5 * sideLengthCube, dSphereLev1),
+                            1);
+                    } else if (scalingType == "weak") {
+                        gridBuilder->addGrid(new Cuboid(-0.5 * dCubeLev1, -0.5 * dCubeLev1,
+                                                        sideLengthCube - 0.5 * dCubeLev1, 0.5 * dCubeLev1,
+                                                        0.5 * dCubeLev1, sideLengthCube + 0.5 * dCubeLev1),
+                                             1);
+                    }
+                }
+
+                if (scalingType == "weak") {
+                    if (useLevels) {
+                        gridBuilder->addGeometry(new Sphere(0.0, 0.0, sideLengthCube, dSphere));
+                    } else {
+                        TriangularMesh *sphereSTL = TriangularMesh::make(stlPath + "Spheres_2GPU.stl");
+                        gridBuilder->addGeometry(sphereSTL);
+                    }
+                } else if (scalingType == "strong") {
+                    gridBuilder->addGeometry(
+                        new Sphere(0.5 * sideLengthCube, 0.5 * sideLengthCube, 0.5 * sideLengthCube, dSphere));
+                }
+
+                if (generatePart == 0)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, yGridMin, yGridMax, zGridMin, zSplit));
+                if (generatePart == 1)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, yGridMin, yGridMax, zSplit, zGridMax));
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 1);
+                }
+
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
+
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+                //////////////////////////////////////////////////////////////////////////
+                gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::MY, vxLB, 0.0, 0.0);
+                gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                if (generatePart == 0)
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                if (generatePart == 1)
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                // gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
+                //////////////////////////////////////////////////////////////////////////
+
+            } else if (communicator.getNummberOfProcess() == 4) {
+                real ySplit = 0.5 * sideLengthCube;
+                real zSplit = 0.5 * sideLengthCube;
+
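+                // weak scaling: double the domain in y and z so all four processes keep the same load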
+                if (scalingType == "weak") {
+                    ySplit   = yGridMax;
+                    yGridMax = yGridMax + (yGridMax - yGridMin);
+                    zSplit   = zGridMax;
+                    zGridMax = zGridMax + (zGridMax - zGridMin);
+                }
+
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xGridMax, ySplit + overlap,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xGridMin, ySplit - overlap, zGridMin, xGridMax, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xGridMax, ySplit + overlap,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->addCoarseGrid(xGridMin, ySplit - overlap, zSplit - overlap, xGridMax, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+
+                if (useLevels) {
+                    if (scalingType == "strong") {
+                        gridBuilder->addGrid(
+                            new Sphere(0.5 * sideLengthCube, 0.5 * sideLengthCube, 0.5 * sideLengthCube, dSphereLev1),
+                            1);
+                    } else if (scalingType == "weak") {
+                        gridBuilder->addGrid(new Cuboid(-0.5 * dCubeLev1, sideLengthCube - 0.5 * dCubeLev1,
+                                                        sideLengthCube - 0.5 * dCubeLev1, 0.5 * dCubeLev1,
+                                                        sideLengthCube + 0.5 * dCubeLev1,
+                                                        sideLengthCube + 0.5 * dCubeLev1),
+                                             1);
+                    }
+                }
+
+                if (scalingType == "weak") {
+                    if (useLevels) {
+                        gridBuilder->addGeometry(new Sphere(0.0, sideLengthCube, sideLengthCube, dSphere));
+                    } else {
+                        TriangularMesh *sphereSTL = TriangularMesh::make(stlPath + "Spheres_4GPU.stl");
+                        gridBuilder->addGeometry(sphereSTL);
+                    }
+                } else if (scalingType == "strong") {
+                    gridBuilder->addGeometry(
+                        new Sphere(0.5 * sideLengthCube, 0.5 * sideLengthCube, 0.5 * sideLengthCube, dSphere));
+                }
+
+                if (generatePart == 0)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, yGridMin, ySplit, zGridMin, zSplit));
+                if (generatePart == 1)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, ySplit, yGridMax, zGridMin, zSplit));
+                if (generatePart == 2)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, yGridMin, ySplit, zSplit, zGridMax));
+                if (generatePart == 3)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xGridMax, ySplit, yGridMax, zSplit, zGridMax));
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 2);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 3);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 1);
+                }
+
+                //////////////////////////////////////////////////////////////////////////
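+                // Velocity inlet on every outer face except PX, which gets the pressure
+                // outlet; each rank only sets the outer faces its subdomain owns.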
+                if (generatePart == 0) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                // gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
+                //////////////////////////////////////////////////////////////////////////
+            } else if (communicator.getNummberOfProcess() == 8) {
+                real xSplit = 0.5 * sideLengthCube;
+                real ySplit = 0.5 * sideLengthCube;
+                real zSplit = 0.5 * sideLengthCube;
+
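+                // Weak scaling doubles the domain in every direction (8x the volume),
+                // so each of the 8 ranks carries the same load as the single-GPU setup.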
+                if (scalingType == "weak") {
+                    xSplit   = xGridMax;
+                    xGridMax = xGridMax + (xGridMax - xGridMin);
+                    ySplit   = yGridMax;
+                    yGridMax = yGridMax + (yGridMax - yGridMin);
+                    zSplit   = zGridMax;
+                    zGridMax = zGridMax + (zGridMax - zGridMin);
+                }
+
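+                // 2x2x2 decomposition: the rank number increments fastest in y, then x,
+                // then z (rank = iy + 2 * ix + 4 * iz).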
+                if (generatePart == 0) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xSplit + overlap, ySplit + overlap,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->addCoarseGrid(xGridMin, ySplit - overlap, zGridMin, xSplit + overlap, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zGridMin, xGridMax, ySplit + overlap,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, ySplit - overlap, zGridMin, xGridMax, yGridMax,
+                                               zSplit + overlap, dxGrid);
+                }
+                if (generatePart == 4) {
+                    gridBuilder->addCoarseGrid(xGridMin, yGridMin, zSplit - overlap, xSplit + overlap, ySplit + overlap,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->addCoarseGrid(xGridMin, ySplit - overlap, zSplit - overlap, xSplit + overlap, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, yGridMin, zSplit - overlap, xGridMax, ySplit + overlap,
+                                               zGridMax, dxGrid);
+                }
+                if (generatePart == 7) {
+                    gridBuilder->addCoarseGrid(xSplit - overlap, ySplit - overlap, zSplit - overlap, xGridMax, yGridMax,
+                                               zGridMax, dxGrid);
+                }
+
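+                // Optional second grid level refining the region around the sphere.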
+                if (useLevels) {
+                    if (scalingType == "strong") {
+                        gridBuilder->addGrid(
+                            new Sphere(0.5 * sideLengthCube, 0.5 * sideLengthCube, 0.5 * sideLengthCube, dSphereLev1),
+                            1);
+                    } else if (scalingType == "weak") {
+                        gridBuilder->addGrid(
+                            new Cuboid(sideLengthCube - 0.5 * dCubeLev1, sideLengthCube - 0.5 * dCubeLev1,
+                                       sideLengthCube - 0.5 * dCubeLev1, sideLengthCube + 0.5 * dCubeLev1,
+                                       sideLengthCube + 0.5 * dCubeLev1, sideLengthCube + 0.5 * dCubeLev1),
+                            1);
+                    }
+                }
+
+                if (scalingType == "weak") {
+                    if (useLevels) {
+                        gridBuilder->addGeometry(new Sphere(sideLengthCube, sideLengthCube, sideLengthCube, dSphere));
+                    } else {
+                        TriangularMesh *sphereSTL = TriangularMesh::make(stlPath + "Spheres_8GPU.stl");
+                        gridBuilder->addGeometry(sphereSTL);
+                    }
+                } else if (scalingType == "strong") {
+                    gridBuilder->addGeometry(
+                        new Sphere(0.5 * sideLengthCube, 0.5 * sideLengthCube, 0.5 * sideLengthCube, dSphere));
+                }
+
+                if (generatePart == 0)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, ySplit, zGridMin, zSplit));
+                if (generatePart == 1)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, ySplit, yGridMax, zGridMin, zSplit));
+                if (generatePart == 2)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, ySplit, zGridMin, zSplit));
+                if (generatePart == 3)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, ySplit, yGridMax, zGridMin, zSplit));
+                if (generatePart == 4)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, yGridMin, ySplit, zSplit, zGridMax));
+                if (generatePart == 5)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xGridMin, xSplit, ySplit, yGridMax, zSplit, zGridMax));
+                if (generatePart == 6)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, yGridMin, ySplit, zSplit, zGridMax));
+                if (generatePart == 7)
+                    gridBuilder->setSubDomainBox(
+                        std::make_shared<BoundingBox>(xSplit, xGridMax, ySplit, yGridMax, zSplit, zGridMax));
+
+                gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+                gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+
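+                // Neighbor ranks follow from the decomposition above:
+                // +/-1 in y, +/-2 in x, +/-4 in z.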
+                if (generatePart == 0) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 4);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 5);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 3);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 0);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 6);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 2);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 1);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PZ, 7);
+                }
+                if (generatePart == 4) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 5);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 6);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 0);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 4);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PX, 7);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 1);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::PY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::PY, 7);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 4);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 2);
+                }
+                if (generatePart == 7) {
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MY, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MY, 6);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MX, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MX, 5);
+                    gridBuilder->findCommunicationIndices(CommunicationDirections::MZ, LBM);
+                    gridBuilder->setCommunicationProcess(CommunicationDirections::MZ, 3);
+                }
+
+                //////////////////////////////////////////////////////////////////////////
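+                // Boundary conditions as in the 4-GPU case: velocity inlet on all outer
+                // faces, pressure outlet on PX; ranks 2, 3, 6 and 7 own the PX face.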
+                if (generatePart == 0) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 1) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 2) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0);
+                }
+                if (generatePart == 3) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0);
+                }
+                if (generatePart == 4) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 5) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                }
+                if (generatePart == 6) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::MY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                }
+                if (generatePart == 7) {
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+                    gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+                    gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+                }
+                // gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
+                //////////////////////////////////////////////////////////////////////////
+            }
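+            // Some kernels iterate over an explicit list of fluid nodes; build it here
+            // (split into border and bulk nodes when CUDA streams are enabled).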
+            if (para->getKernelNeedsFluidNodeIndicesToRun())
+                gridBuilder->findFluidNodes(para->getUseStreams());
+
+            // gridBuilder->writeGridsToVtk(outPath + "grid/part" + std::to_string(generatePart) + "_");
+            // gridBuilder->writeGridsToVtk(outPath + std::to_string(generatePart) + "/grid/");
+            // gridBuilder->writeArrows(outPath + std::to_string(generatePart) + "/arrow");
+
+            SimulationFileWriter::write(gridPath + std::to_string(generatePart) + "/", gridBuilder, FILEFORMAT::BINARY);
+        } else {
+
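+            // Single-rank setup: build the whole domain on one process, without
+            // subdomain boxes or communication neighbors.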
+            gridBuilder->addCoarseGrid(xGridMin, yGridMin, zGridMin, xGridMax, yGridMax, zGridMax, dxGrid);
+
+            if (useLevels) {
+                gridBuilder->setNumberOfLayers(10, 8);
+                if (scalingType == "strong") {
+                    gridBuilder->addGrid(
+                        new Sphere(0.5 * sideLengthCube, 0.5 * sideLengthCube, 0.5 * sideLengthCube, dSphereLev1), 1);
+                } else if (scalingType == "weak")
+                    gridBuilder->addGrid(new Cuboid(sideLengthCube - 0.5 * dCubeLev1, sideLengthCube - 0.5 * dCubeLev1,
+                                                    sideLengthCube - 0.5 * dCubeLev1, sideLengthCube + 0.5 * dCubeLev1,
+                                                    sideLengthCube + 0.5 * dCubeLev1, sideLengthCube + 0.5 * dCubeLev1),
+                                         1);
+            }
+
+            if (scalingType == "weak") {
+                if (useLevels) {
+                    gridBuilder->addGeometry(new Sphere(sideLengthCube, sideLengthCube, sideLengthCube, dSphere));
+                } else {
+                    TriangularMesh *sphereSTL = TriangularMesh::make(stlPath + "Spheres_1GPU.stl");
+                    gridBuilder->addGeometry(sphereSTL);
+                }
+            } else {
+                gridBuilder->addGeometry(
+                    new Sphere(0.5 * sideLengthCube, 0.5 * sideLengthCube, 0.5 * sideLengthCube, dSphere));
+            }
+
+            gridBuilder->buildGrids(LBM, true); // buildGrids() has to be called before setting the BCs!!!!
+
+            gridBuilder->setPeriodicBoundaryCondition(false, false, false);
+            //////////////////////////////////////////////////////////////////////////
+            gridBuilder->setVelocityBoundaryCondition(SideType::PY, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::MY, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::MX, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::MZ, vxLB, 0.0, 0.0);
+            gridBuilder->setVelocityBoundaryCondition(SideType::PZ, vxLB, 0.0, 0.0);
+            gridBuilder->setPressureBoundaryCondition(SideType::PX, 0.0); // set pressure BC after velocity BCs
+
+            // gridBuilder->setVelocityBoundaryCondition(SideType::GEOMETRY, 0.0, 0.0, 0.0);
+            //////////////////////////////////////////////////////////////////////////
+            if (para->getKernelNeedsFluidNodeIndicesToRun())
+                gridBuilder->findFluidNodes(para->getUseStreams());
+
+            // gridBuilder->writeGridsToVtk("E:/temp/MusselOyster/" + "/grid/");
+            // gridBuilder->writeArrows ("E:/temp/MusselOyster/" + "/arrow");
+
+            SimulationFileWriter::write(gridPath, gridBuilder, FILEFORMAT::BINARY);
+        }
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
+
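+    // Either generate the grid in memory or read back the grid files written by
+    // SimulationFileWriter above.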
+    SPtr<GridProvider> gridGenerator;
+    if (useGridGenerator)
+        gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
+    else {
+        gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
+    }
+
+    Simulation sim(communicator);
+    SPtr<FileWriter> fileWriter                      = SPtr<FileWriter>(new FileWriter());
+    SPtr<KernelFactoryImp> kernelFactory             = KernelFactoryImp::getInstance();
+    SPtr<PreProcessorFactoryImp> preProcessorFactory = PreProcessorFactoryImp::getInstance();
+    sim.setFactories(kernelFactory, preProcessorFactory);
+    sim.init(para, gridGenerator, fileWriter, cudaMemoryManager);
+    sim.run();
+    sim.free();
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+}
+
+int main(int argc, char *argv[])
+{
+    MPI_Init(&argc, &argv);
+    std::string configFile;
+
+    if (argv != NULL) {
+
+        try {
+            //////////////////////////////////////////////////////////////////////////
+
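+            // Use the config file passed as the first command line argument,
+            // falling back to the config.txt next to this source file.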
+            std::string targetPath = __FILE__;
+
+            if (argc == 2) {
+                configFile = argv[1];
+                std::cout << "Using configFile command line argument: " << configFile << std::endl;
+            }
+
+#ifdef _WIN32
+            targetPath = targetPath.substr(0, targetPath.find_last_of('\\') + 1);
+#else
+            targetPath = targetPath.substr(0, targetPath.find_last_of('/') + 1);
+#endif
+
+            std::cout << targetPath << std::endl;
+
+            if (configFile.size() == 0) {
+                configFile = targetPath + "config.txt";
+            }
+
+            multipleLevel(configFile);
+
+            //////////////////////////////////////////////////////////////////////////
+        } catch (const std::bad_alloc &e) {
+            *logging::out << logging::Logger::LOGGER_ERROR << "Bad alloc: " << e.what() << "\n";
+        } catch (const std::exception &e) {
+            *logging::out << logging::Logger::LOGGER_ERROR << e.what() << "\n";
+        } catch (...) {
+            *logging::out << logging::Logger::LOGGER_ERROR << "Unknown exception!\n";
+        }
+    }
+
+    MPI_Finalize();
+    return 0;
+}
diff --git a/apps/gpu/LBM/SphereScaling/config.txt b/apps/gpu/LBM/SphereScaling/config.txt
new file mode 100644
index 0000000000000000000000000000000000000000..44c5fedb297cc62f8b1b5d26c075bd5172cac081
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/config.txt
@@ -0,0 +1,46 @@
+# Tesla 03
+# mpiexec -n 2 "C:/Users/Master/Documents/MasterAnna/VirtualFluids_dev/build/bin/Release/SphereScaling.exe" "C:/Users/Master/Documents/MasterAnna/VirtualFluids_dev/apps/gpu/LBM/SphereScaling/config.txt"
+# Phoenix
+# mpirun -np 2 "./VirtualFluids_dev/build/bin/SphereScaling" "./VirtualFluids_dev/apps/gpu/LBM/SphereScaling/config.txt"
+
+# Phoenix mpich
+# mpirun -np 2 nvprof -f -o SphereScaling.%q{PMI_RANK}.nvprof "./VirtualFluids_dev/build/bin/SphereScaling" "./VirtualFluids_dev/apps/gpu/LBM/SphereScaling/configPhoenix4GPU.txt"
+# Phoenix openmpi
+# mpirun -np 2 nvprof -f -o SphereScaling.%q{OMPI_COMM_WORLD_RANK}.nvprof "./VirtualFluids_dev/build/bin/SphereScaling" "./VirtualFluids_dev/apps/gpu/LBM/SphereScaling/configPhoenix4GPU.txt"
+
+##################################################
+#GPU Mapping
+##################################################
+#Devices="0 1 2 3"
+#NumberOfDevices=2
+
+##################################################
+#information for writing
+##################################################
+#Path="E:/temp/SphereScalingResults/"
+Path=/work/y0078217/Results/SphereScalingResults/
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/
+#GridPath=E:/temp/GridSphereScaling/
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+#TimeEnd=10
+#TimeOut=10 
+#TimeStartOut=0
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix1GPU_1LevStrongOS.txt b/apps/gpu/LBM/SphereScaling/configPhoenix1GPU_1LevStrongOS.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5488797815bd797916434e8b6a0a82ce623a8db4
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix1GPU_1LevStrongOS.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=2
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/1GPU/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling1GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = false
+useReducedCommunicationInInterpolation = false
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix1GPU_1LevStrongStream.txt b/apps/gpu/LBM/SphereScaling/configPhoenix1GPU_1LevStrongStream.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e93f161aa16977ecd65aab230f40db0bbef60130
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix1GPU_1LevStrongStream.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=2
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/1GPU/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling1GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix2GPU_1LevStrongOS.txt b/apps/gpu/LBM/SphereScaling/configPhoenix2GPU_1LevStrongOS.txt
new file mode 100644
index 0000000000000000000000000000000000000000..795e6bcb7d0dc3314f26b171c2f61d88e005a797
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix2GPU_1LevStrongOS.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=2
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/2GPU/1LevStrongOS/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling2GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = false
+useReducedCommunicationInInterpolation = false
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix2GPU_1LevStrongStream.txt b/apps/gpu/LBM/SphereScaling/configPhoenix2GPU_1LevStrongStream.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ef75fb88e563869b67f8aa33d839ec85c1d749b6
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix2GPU_1LevStrongStream.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=2
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/2GPU/1LevStrongStream/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling2GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000 
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevStrongOS.txt b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevStrongOS.txt
new file mode 100644
index 0000000000000000000000000000000000000000..99a057d31c7f15659d32776967853e076b5939ee
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevStrongOS.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/4GPU/1LevStrongOS/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling4GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = false
+useReducedCommunicationInInterpolation = false
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevStrongStream.txt b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevStrongStream.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ea6338672305177b5119a2f557675bc491fddadc
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevStrongStream.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/4GPU/1LevStrongStream/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling4GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevWeakStream.txt b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevWeakStream.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ad05efa37ec5fadc9bc5fe9711485ec6f03e1960
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_1LevWeakStream.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/4GPU/1LevWeakStream/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling4GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000 
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevStrongOS.txt b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevStrongOS.txt
new file mode 100644
index 0000000000000000000000000000000000000000..892f11013d6742af416ba3b93a993b059a6fa3a0
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevStrongOS.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/8GPU/1LevStrongOS/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling8GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000 
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = false
+useReducedCommunicationInInterpolation = false
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevStrongStream.txt b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevStrongStream.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b026d6b7304f9f13effec6c899512beb804787f5
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevStrongStream.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/8GPU/1LevStrongStream/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling8GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=1000
+TimeOut=1000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevWeakOS.txt b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevWeakOS.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ae848a2889d1301de78c6fff42e045965fa9baf7
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevWeakOS.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/8GPU/1LevWeakOS/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling8GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100000
+TimeOut=100000 
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = false
+useReducedCommunicationInInterpolation = false
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevWeakStream.txt b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevWeakStream.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fc8403eca0bcf96645c85b81c3109ec7619f34d2
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_1LevWeakStream.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/8GPU/1LevWeakStream/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling8GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=100
+TimeOut=100
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_2LevStrongStream.txt b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_2LevStrongStream.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a6d54810d8d4f0ded262a61c5535764c2f6f91b3
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix8GPU_2LevStrongStream.txt
@@ -0,0 +1,42 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#information for writing
+##################################################
+Path=/work/y0078217/Results/SphereScalingResults/8GPU/2LevStrongStream/
+#Path="F:/Work/Computations/out/SphereScaling/"
+#Prefix="SphereScaling" 
+#WriteGrid=true
+##################################################
+#information for reading
+##################################################
+GridPath=/work/y0078217/Grids/GridSphereScaling/SphereScaling8GPU/
+#GridPath="C:"
+
+##################################################
+#number of grid levels
+##################################################
+#NOGL=1
+
+##################################################
+#LBM Version
+##################################################
+#D3Qxx=27
+#MainKernelName=CumulantK17CompChim
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=10000
+TimeOut=10000
+#TimeStartOut=0
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/apps/gpu/LBM/TGV_3D/TGV_3D.cpp b/apps/gpu/LBM/TGV_3D/TGV_3D.cpp
index 7c23a683d7966a7e5c3d10b6025a8fa5ed802dac..db51215de1e4b5c0060171191ef68a0d3817b88d 100644
--- a/apps/gpu/LBM/TGV_3D/TGV_3D.cpp
+++ b/apps/gpu/LBM/TGV_3D/TGV_3D.cpp
@@ -253,7 +253,7 @@ void multipleLevel(const std::string& configPath)
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     
     SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
-    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
     //SPtr<GridProvider> gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
 
     Simulation sim;
diff --git a/apps/gpu/LBM/TGV_3D_MultiGPU/TGV_3D_MultiGPU.cpp b/apps/gpu/LBM/TGV_3D_MultiGPU/TGV_3D_MultiGPU.cpp
index 3e761d1b8a5164af90f2010a941d85834fd6361f..8f1c208124459186c92908c04a00dbabb059f03f 100644
--- a/apps/gpu/LBM/TGV_3D_MultiGPU/TGV_3D_MultiGPU.cpp
+++ b/apps/gpu/LBM/TGV_3D_MultiGPU/TGV_3D_MultiGPU.cpp
@@ -314,7 +314,7 @@ void multipleLevel(const std::string& configPath)
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     
     SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
-    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
     //SPtr<GridProvider> gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
 
     Simulation sim;
diff --git a/apps/gpu/LBM/WTG_RUB/WTG_RUB.cpp b/apps/gpu/LBM/WTG_RUB/WTG_RUB.cpp
index 565ce4ba9cd9c31d9ff3bb4b3e4f0fc32b4a58f0..533ce7b612ca5301f6b4921da9f27191805f5465 100644
--- a/apps/gpu/LBM/WTG_RUB/WTG_RUB.cpp
+++ b/apps/gpu/LBM/WTG_RUB/WTG_RUB.cpp
@@ -86,8 +86,11 @@ std::string simulationName("");
 // 4: setup 3 of MSch (small/test)  (3 level, 4.0 cm -> 1.0  cm)
 int setupDomain = 4;
 
-std::string path("D:/out/WTG_RUB"); //Mollok
-std::string inputPath("D:/out/WTG_RUB/input/");
+// std::string path("D:/out/WTG_RUB"); // Mollok
+// std::string inputPath("D:/out/WTG_RUB/input/");
+
+std::string path("/workspaces/VirtualFluids_dev/output/WTG_RUB_Results/"); // Aragorn
+std::string inputPath("/workspaces/VirtualFluids_dev/stl/WTG_RUB/");
 
 // const uint timeStepStartOut = 0;
 const uint timeStepOut = 10000;
@@ -329,7 +332,7 @@ void multipleLevel(const std::string& configPath)
 
     SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
 
-    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
 
     Simulation sim (communicator);
     SPtr<FileWriter> fileWriter = SPtr<FileWriter>(new FileWriter());
diff --git a/apps/gpu/LBM/gridGeneratorTest/CMakeLists.txt b/apps/gpu/LBM/gridGeneratorTest/CMakeLists.txt
index b05d36ee5a0d47a21ee5b96fb4fbb2cb0485ae38..6493b72cfd996a6866fe0fb07291dd2e3438dd03 100644
--- a/apps/gpu/LBM/gridGeneratorTest/CMakeLists.txt
+++ b/apps/gpu/LBM/gridGeneratorTest/CMakeLists.txt
@@ -1,7 +1,5 @@
-PROJECT(GridGeneratorTest)
+PROJECT(GridGeneratorTest LANGUAGES CUDA CXX)
 
-vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator FILES gridGenerator.cpp)
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES gridGenerator.cpp)
 
-set_source_files_properties(gridGenerator.cpp PROPERTIES LANGUAGE CUDA)
-
-linkCUDA()
+set_source_files_properties(gridGenerator.cpp PROPERTIES LANGUAGE CUDA)
diff --git a/apps/gpu/LBM/gridGeneratorTest/gridGenerator.cpp b/apps/gpu/LBM/gridGeneratorTest/gridGenerator.cpp
index 401ffc1efe60aded6341ee18de056b96d4b986da..aba4752df9796c422302fdd4c37deb3c01dcb0c3 100644
--- a/apps/gpu/LBM/gridGeneratorTest/gridGenerator.cpp
+++ b/apps/gpu/LBM/gridGeneratorTest/gridGenerator.cpp
@@ -31,7 +31,6 @@
 #include "VirtualFluids_GPU/PreProcessor/PreProcessorFactory/PreProcessorFactoryImp.h"
 
 #include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
-#include "VirtualFluids_GPU/Kernel/Utilities/Mapper/KernelMapper/KernelMapper.h"
 
 #include "global.h"
 
@@ -79,11 +78,9 @@ void multipleLevel(const std::string& configPath)
 
     auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
     
-	Communicator* comm = Communicator::getInstanz();
+	vf::gpu::Communicator *comm         = vf::gpu::Communicator::getInstanz();
 	SPtr<ConfigFileReader> configReader = ConfigFileReader::getNewInstance();
-	SPtr<ConfigData> configData = configReader->readConfigFile(configPath);
-
-    std::shared_ptr<KernelMapper> kernelMapper = KernelMapper::getInstance();
+    SPtr<ConfigData> configData         = configReader->readConfigFile(configPath.c_str());
 
     SPtr<Parameter> para = Parameter::make(configData, comm);
 
@@ -208,7 +205,7 @@ void multipleLevel(const std::string& configPath)
 
             para->setUseWale(false);
 
-            para->setMainKernel(kernelMapper->getEnum("CumulantK15Comp"));
+            para->setMainKernel("CumulantK15Comp");
 
             //////////////////////////////////////////////////////////////////////////
 
@@ -291,7 +288,7 @@ void multipleLevel(const std::string& configPath)
 
             para->setUseWale(false);
 
-            para->setMainKernel(kernelMapper->getEnum("CumulantK20Comp"));
+            para->setMainKernel("CumulantK20Comp");
 
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -361,7 +358,7 @@ void multipleLevel(const std::string& configPath)
             //SimulationFileWriter::write("grid/", gridBuilder, FILEFORMAT::ASCII);
 
             //return;
-            //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+            //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
         }
 
         //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -392,7 +389,7 @@ void multipleLevel(const std::string& configPath)
 
             para->setUseWale(false);
 
-            para->setMainKernel(kernelMapper->getEnum("CumulantAA2016CompSP27"));
+            para->setMainKernel("CumulantAA2016CompSP27");
 
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -437,7 +434,7 @@ void multipleLevel(const std::string& configPath)
             //SimulationFileWriter::write("grid/", gridBuilder, FILEFORMAT::ASCII);
 
             //return;
-            //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+            //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
         }
 
         //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -468,7 +465,7 @@ void multipleLevel(const std::string& configPath)
 
             para->setUseWale(false);
 
-            para->setMainKernel(kernelMapper->getEnum("CumulantAA2016CompSP27"));
+            para->setMainKernel("CumulantAA2016CompSP27");
             //para->setMainKernel(kernelMapper->getEnum("CumulantOneCompSP27"));
 
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -626,7 +623,7 @@ void multipleLevel(const std::string& configPath)
             //SimulationFileWriter::write("C:/Users/lenz/Desktop/Work/gridGenerator/grid/", gridBuilder, FILEFORMAT::ASCII);
             SimulationFileWriter::write("grid/", gridBuilder, FILEFORMAT::ASCII);
             
-            //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+            //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
         }
 
         //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -659,13 +656,13 @@ void multipleLevel(const std::string& configPath)
 
             para->setUseWale(false);
 
-            para->setMainKernel(kernelMapper->getEnum("CumulantK15Comp"));
+            para->setMainKernel("CumulantK15Comp");
 
             para->setDevices( { 0, 1 } );
             para->setMaxDev(2);
 
             //const uint generatePart = 1;
-            const uint generatePart = Communicator::getInstanz()->getPID();
+            const uint generatePart = vf::gpu::Communicator::getInstanz()->getPID();
             
             std::ofstream logFile2;
             
@@ -762,13 +759,13 @@ void multipleLevel(const std::string& configPath)
 
             //return;
             
-            //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+            //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
         }
 
     }
     else
     {
-        //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+        //gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
         //gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
     }
 
@@ -783,7 +780,7 @@ void multipleLevel(const std::string& configPath)
     SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
 
     SPtr<GridProvider> gridGenerator;
-    if( useGridGenerator ) gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+    if( useGridGenerator ) gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
     else                   gridGenerator = GridProvider::makeGridReader(FILEFORMAT::BINARY, para, cudaMemoryManager);
 
     Simulation sim;
diff --git a/apps/gpu/LidDrivenCavity/LidDrivenCavity.cpp b/apps/gpu/LidDrivenCavity/LidDrivenCavity.cpp
index 8f11d8a197be080e59ab1e445596341b579ea66d..2c63892ffb0bc0ead8d18c69b597aa3760fb3d53 100644
--- a/apps/gpu/LidDrivenCavity/LidDrivenCavity.cpp
+++ b/apps/gpu/LidDrivenCavity/LidDrivenCavity.cpp
@@ -199,7 +199,7 @@ int main( int argc, char* argv[])
 
 		    SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
 
-            SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+            SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager, communicator);
     
             //////////////////////////////////////////////////////////////////////////
             // run simulation
diff --git a/gpu.cmake b/gpu.cmake
index 4a1b1a9eb070dcb85ff0c4147fa3b272372a2da9..5b70ed9da9cb5f0ac56d09a0f91f0a6b6d13b89a 100644
--- a/gpu.cmake
+++ b/gpu.cmake
@@ -32,12 +32,16 @@ IF (BUILD_VF_GPU)
     #add_subdirectory(targets/apps/LBM/BaselMultiGPU)
 
     add_subdirectory(apps/gpu/LBM/DrivenCavity)
-    add_subdirectory(apps/gpu/LBM/WTG_RUB)
+    #add_subdirectory(apps/gpu/LBM/WTG_RUB)
     #add_subdirectory(apps/gpu/LBM/gridGeneratorTest)
     #add_subdirectory(apps/gpu/LBM/TGV_3D)
     #add_subdirectory(apps/gpu/LBM/TGV_3D_MultiGPU)
+    #add_subdirectory(apps/gpu/LBM/SphereScaling)
+    #add_subdirectory(apps/gpu/LBM/DrivenCavityMultiGPU)
+    #add_subdirectory(apps/gpu/LBM/MusselOyster)
+    #add_subdirectory(apps/gpu/LBM/Poiseuille)
     #add_subdirectory(apps/gpu/LBM/ActuatorLine)
-    add_subdirectory(apps/gpu/LBM/BoundaryLayer)
+    #add_subdirectory(apps/gpu/LBM/BoundaryLayer)
 ELSE()
     MESSAGE( STATUS "exclude Virtual Fluids GPU." )
 ENDIF()
diff --git a/src/gpu/GridGenerator/geometries/Vertex/Vertex.h b/src/gpu/GridGenerator/geometries/Vertex/Vertex.h
index 7b27d853f652459143699204c59a5843de6eaf39..ec5fc0f1ced64f7757de26deaf3053504e29d7c6 100644
--- a/src/gpu/GridGenerator/geometries/Vertex/Vertex.h
+++ b/src/gpu/GridGenerator/geometries/Vertex/Vertex.h
@@ -37,7 +37,7 @@
 #include <memory>
 #include <ostream>
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
 struct GRIDGENERATOR_EXPORT Vertex
 {
diff --git a/src/gpu/GridGenerator/grid/Cell.h b/src/gpu/GridGenerator/grid/Cell.h
index a7a64917025742217be88e7ebe1ccc9e669fc7c0..f39bd4a7bb13be0d768e092276194e6bad16dcff 100644
--- a/src/gpu/GridGenerator/grid/Cell.h
+++ b/src/gpu/GridGenerator/grid/Cell.h
@@ -33,7 +33,7 @@
 #ifndef CELL_H
 #define CELL_H
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
 struct Point
 {
diff --git a/src/gpu/GridGenerator/grid/Field.h b/src/gpu/GridGenerator/grid/Field.h
index 002c8c108bd405f4077cd2779f5e59232135ace9..08fff6da7c5a3f431138dc5039b4d234493ae4b8 100644
--- a/src/gpu/GridGenerator/grid/Field.h
+++ b/src/gpu/GridGenerator/grid/Field.h
@@ -33,7 +33,7 @@
 #ifndef FIELD_H
 #define FIELD_H
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
 struct Vertex;
 
diff --git a/src/gpu/GridGenerator/grid/Grid.h b/src/gpu/GridGenerator/grid/Grid.h
index 3407b23c1efbb4143d06ded7880c26d7c0eb6599..3f28120a5d969fcc5d7b2a3402a2169ff97c0cc3 100644
--- a/src/gpu/GridGenerator/grid/Grid.h
+++ b/src/gpu/GridGenerator/grid/Grid.h
@@ -28,18 +28,18 @@
 //
 //! \file Grid.h
 //! \ingroup grid
-//! \author Soeren Peters, Stephan Lenz, Martin Schönherr
+//! \author Soeren Peters, Stephan Lenz, Martin Schönherr
 //=======================================================================================
 #ifndef GRID_H
 #define GRID_H
 
 #include "Core/LbmOrGks.h"
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
-#include "geometries/Vertex/Vertex.h"
+#include "gpu/GridGenerator/geometries/Vertex/Vertex.h"
 
-#include "grid/Cell.h"
+#include "gpu/GridGenerator/grid/Cell.h"
 
 class TriangularMesh;
 struct Vertex;
@@ -82,6 +82,7 @@ public:
     virtual void setFieldEntry(uint matrixIndex, char type) = 0;
 
     virtual void getGridInterfaceIndices(uint* iCellCfc, uint* iCellCff, uint* iCellFcc, uint* iCellFcf) const = 0;
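+    //! \brief check whether a sparse index (in VF_GPU numbering, i.e. sparse index + 1) belongs to a fluid node on the communication border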
+    virtual bool isSparseIndexInFluidNodeIndicesBorder(uint &sparseIndex) const = 0;
 
     virtual int *getNeighborsX() const = 0;
     virtual int *getNeighborsY() const = 0;
@@ -162,11 +163,21 @@ public:
     virtual uint getNumberOfSendNodes(int direction)    = 0;
     virtual uint getNumberOfReceiveNodes(int direction) = 0;
 
+    virtual bool isSendNode(int index) const                = 0;
+    virtual bool isReceiveNode(int index) const             = 0;
     virtual uint getSendIndex(int direction, uint index)    = 0;
     virtual uint getReceiveIndex(int direction, uint index) = 0;
 
-    virtual void repairCommunicationInices(int direction) = 0;
+    virtual void repairCommunicationIndices(int direction) = 0;
 
+    // needed for CUDA Streams 
+    virtual void findFluidNodeIndices(bool onlyBulk) = 0;
+    virtual uint getNumberOfFluidNodes() const = 0;
+    virtual void getFluidNodeIndices(uint *fluidNodeIndices) const = 0;
+
+    virtual void findFluidNodeIndicesBorder() = 0;
+    virtual uint getNumberOfFluidNodesBorder() const = 0;
+    virtual void getFluidNodeIndicesBorder(uint *fluidNodeIndicesBorder) const = 0;
 };
 
 #endif
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
index a5ee3943f23ed4e9ffa1acb92ffc525e9de7780c..6fe296ebdc90902b7dfe10ef95b0238532c6dd3c 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
@@ -37,7 +37,7 @@
 #include <string>
 #include <memory>
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
 #define GEOMQS 6
 #define INLETQS 0
@@ -125,11 +125,15 @@ public:
 
     virtual uint getCommunicationProcess(int direction) = 0;
 
+    virtual uint getNumberOfFluidNodes(unsigned int level) const = 0;
+    virtual void getFluidNodeIndices(uint *fluidNodeIndices, const int level) const = 0;
+    virtual uint getNumberOfFluidNodesBorder(unsigned int level) const = 0;
+    virtual void getFluidNodeIndicesBorder(uint *fluidNodeIndices, const int level) const = 0;
+
     virtual uint getNumberOfSendIndices(int direction, uint level)             = 0;
     virtual uint getNumberOfReceiveIndices(int direction, uint level)          = 0;
     virtual void getSendIndices(int *sendIndices, int direction, int level)    = 0;
     virtual void getReceiveIndices(int *sendIndices, int direction, int level) = 0;
 };
 
-#endif
-
+#endif
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
index 30156a7c65ffff00fec92ec1d8a7644236756488..4c1574fc02c01fa634915a5fb66dbd88064ccd80 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
@@ -34,6 +34,7 @@
 
 #include <stdio.h>
 #include <iostream>
+#include <algorithm>
 
 #include "geometries/Arrow/ArrowImp.h"
 #include "geometries/BoundingBox/BoundingBox.h"
@@ -315,7 +316,6 @@ uint LevelGridBuilder::getNumberOfNodes(unsigned int level) const
     return grids[level]->getSparseSize();
 }
 
-
 std::shared_ptr<Grid> LevelGridBuilder::getGrid(int level, int box)
 {
     return this->grids[level];
@@ -345,6 +345,26 @@ void LevelGridBuilder::getNodeValues(real *xCoords, real *yCoords, real *zCoords
 }
 
 
+GRIDGENERATOR_EXPORT void LevelGridBuilder::getFluidNodeIndices(uint *fluidNodeIndices, const int level) const
+{
+    grids[level]->getFluidNodeIndices(fluidNodeIndices);
+}
+
+GRIDGENERATOR_EXPORT void LevelGridBuilder::getFluidNodeIndicesBorder(uint *fluidNodeIndices, const int level) const
+{
+    grids[level]->getFluidNodeIndicesBorder(fluidNodeIndices);
+}
+
+GRIDGENERATOR_EXPORT uint LevelGridBuilder::getNumberOfFluidNodes(unsigned int level) const
+{
+    return grids[level]->getNumberOfFluidNodes();
+}
+
+GRIDGENERATOR_EXPORT uint LevelGridBuilder::getNumberOfFluidNodesBorder(unsigned int level) const
+{
+    return grids[level]->getNumberOfFluidNodesBorder();
+}
+
 uint LevelGridBuilder::getSlipSize(int level) const
 {
     uint size = 0;
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
index f3d21cf130aaaf5caac78c8828f35951ebd4e510..a11f0d20da1cb671dfcb073d85e4add1da9646fb 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
@@ -38,11 +38,12 @@
 #include <memory>
 #include <array>
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
-#include "grid/GridBuilder/GridBuilder.h"
-#include "grid/Grid.h"
-#include "grid/NodeValues.h"
+#include "gpu/GridGenerator/grid/GridBuilder/GridBuilder.h"
+#include "gpu/GridGenerator/grid/Grid.h"
+#include "gpu/GridGenerator/grid/GridInterface.h"
+#include "gpu/GridGenerator/grid/NodeValues.h"
 
 struct Vertex;
 class  Grid;
@@ -90,6 +91,10 @@ public:
 
     GRIDGENERATOR_EXPORT virtual unsigned int getNumberOfNodes(unsigned int level) const override;
 
+    GRIDGENERATOR_EXPORT virtual uint getNumberOfFluidNodes(unsigned int level) const override;
+    GRIDGENERATOR_EXPORT virtual void getFluidNodeIndices(uint* fluidNodeIndices, const int level) const override;
+    GRIDGENERATOR_EXPORT virtual uint getNumberOfFluidNodesBorder(unsigned int level) const override;
+    GRIDGENERATOR_EXPORT virtual void getFluidNodeIndicesBorder(uint *fluidNodeIndices, const int level) const override;
 
     GRIDGENERATOR_EXPORT virtual void getNodeValues(real *xCoords, real *yCoords, real *zCoords,
                                          uint *neighborX, uint *neighborY, uint *neighborZ, uint *neighborNegative, 
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.cpp b/src/gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.cpp
index da18a883181069f089e7232c9cd1b4f19cc9dc35..0d903b9a4fdb067155dd6b9ee6c60257a63b3ad0 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.cpp
+++ b/src/gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.cpp
@@ -626,6 +626,14 @@ void MultipleGridBuilder::findCommunicationIndices(int direction, LbmOrGks lbmOr
     *logging::out << logging::Logger::INFO_HIGH << "Done with findCommunicationIndices()\n";
 }
 
+void MultipleGridBuilder::findFluidNodes(bool splitDomain)
+{
+    *logging::out << logging::Logger::INFO_HIGH << "Start findFluidNodes()\n";
+    for (uint i = 0; i < grids.size(); i++)
+        grids[i]->findFluidNodeIndices(splitDomain);
+    *logging::out << logging::Logger::INFO_HIGH << "Done with findFluidNodes()\n";
+}
+
 void MultipleGridBuilder::writeGridsToVtk(const std::string& path) const
 {
     for(uint level = 0; level < grids.size(); level++)
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.h
index 9627fb0bf7e97a925d4b0ba2c450c507426a48f4..e28be0087b44d599a792f2f265d3286b650eca63 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/MultipleGridBuilder.h
@@ -114,6 +114,9 @@ private:
 
 public:
     GRIDGENERATOR_EXPORT void findCommunicationIndices(int direction, LbmOrGks lbmOrGks);
+
+    // needed for CUDA Streams MultiGPU
+    GRIDGENERATOR_EXPORT void findFluidNodes(bool splitDomain);
 };
 
 #endif
diff --git a/src/gpu/GridGenerator/grid/GridImp.cpp b/src/gpu/GridGenerator/grid/GridImp.cpp
index 7eda4f9b8e5a374347b8572f3a28a947be5ad9cb..f6afafcd521245222c33972dcb46a9e9b2879826 100644
--- a/src/gpu/GridGenerator/grid/GridImp.cpp
+++ b/src/gpu/GridGenerator/grid/GridImp.cpp
@@ -37,6 +37,7 @@
 #include <iostream>
 #include <omp.h>
 #include <sstream>
+#include <algorithm>
 #include <cmath>
 
 #include "global.h"
@@ -907,6 +908,53 @@ void GridImp::updateSparseIndices()
     sparseSize = size - removedNodes;
 }
 
+void GridImp::findFluidNodeIndices(bool splitDomain) 
+{
+    // find sparse index of all fluid nodes
+    this->fluidNodeIndices.clear();
+    for (uint index = 0; index < this->size; index++) {
+        int sparseIndex = this->getSparseIndex(index);
+        if (sparseIndex == -1)
+            continue;
+        if (this->field.isFluid(index))
+            this->fluidNodeIndices.push_back((uint)sparseIndex+1); // + 1 for numbering shift between GridGenerator and VF_GPU
+    }
+
+    // If splitDomain: find fluidNodeIndicesBorder and remove all indices in fluidNodeIndicesBorder from fluidNodeIndices
+    if (splitDomain) {
+        findFluidNodeIndicesBorder();
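+        // std::set_difference requires both ranges to be sorted; fluidNodeIndicesBorder is sorted in
+        // findFluidNodeIndicesBorder() (assuming the sparse numbering preserves matrix-index order), and
+        // writing the result back into fluidNodeIndices relies on the output never overtaking the first input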
+        std::sort(this->fluidNodeIndices.begin(), this->fluidNodeIndices.end());
+        auto iterator = std::set_difference(this->fluidNodeIndices.begin(), this->fluidNodeIndices.end(),
+                            this->fluidNodeIndicesBorder.begin(), this->fluidNodeIndicesBorder.end(),
+                            this->fluidNodeIndices.begin());
+        this->fluidNodeIndices.resize(iterator - this->fluidNodeIndices.begin());
+    }
+}
+
+void GridImp::findFluidNodeIndicesBorder() {
+    this->fluidNodeIndicesBorder.clear();
+
+    // resize fluidNodeIndicesBorder (for better performance in copy operation)
+    size_t newSize = 0;
+    for (CommunicationIndices& ci : this->communicationIndices)
+        newSize += ci.sendIndices.size();    
+    this->fluidNodeIndicesBorder.reserve(newSize);
+
+    // copy all send indices to fluidNodeIndicesBorder
+    for (CommunicationIndices& ci : this->communicationIndices)
+        std::copy(ci.sendIndices.begin(), ci.sendIndices.end(), std::back_inserter(this->fluidNodeIndicesBorder));
+
+    // remove duplicate elements
+    std::sort(this->fluidNodeIndicesBorder.begin(), this->fluidNodeIndicesBorder.end());
+    this->fluidNodeIndicesBorder.erase(
+        std::unique(this->fluidNodeIndicesBorder.begin(), this->fluidNodeIndicesBorder.end()),
+        this->fluidNodeIndicesBorder.end());
+
+    // + 1 for numbering shift between GridGenerator and VF_GPU
+    for (size_t i = 0; i < this->fluidNodeIndicesBorder.size(); i++)
+        this->fluidNodeIndicesBorder[i] = this->getSparseIndex(this->fluidNodeIndicesBorder[i])+1;
+}
+
 void GridImp::setNeighborIndices(uint index)
 {
     real x, y, z;
@@ -1638,6 +1686,27 @@ void GridImp::findCommunicationIndex( uint index, real coordinate, real limit, i
 	}
 }
 
+bool GridImp::isSendNode(int index) const
+{
+    bool isSendNode = false;
+    for (size_t direction = 0; direction < this->communicationIndices.size(); direction++)
+        if (std::find(this->communicationIndices[direction].sendIndices.begin(),
+                      this->communicationIndices[direction].sendIndices.end(), index) != this->communicationIndices[direction].sendIndices.end())
+            isSendNode = true;
+    return isSendNode;
+}
+
+bool GridImp::isReceiveNode(int index) const
+{
+    bool isReceiveNode = false;
+    for (size_t direction = 0; direction < this->communicationIndices.size(); direction++)
+        if (std::find(this->communicationIndices[direction].receiveIndices.begin(),
+                      this->communicationIndices[direction].receiveIndices.end(),
+                      index) != this->communicationIndices[direction].receiveIndices.end())
+            isReceiveNode = true;
+    return isReceiveNode;
+}
+
 uint GridImp::getNumberOfSendNodes(int direction)
 {
     return (uint)this->communicationIndices[direction].sendIndices.size();
@@ -1658,7 +1727,7 @@ uint GridImp::getReceiveIndex(int direction, uint index)
     return this->communicationIndices[direction].receiveIndices[ index ];
 }
 
-void GridImp::repairCommunicationInices(int direction )
+void GridImp::repairCommunicationIndices(int direction)
 {
     this->communicationIndices[direction].sendIndices.insert( this->communicationIndices[direction].sendIndices.end(), 
                                                               this->communicationIndices[direction+1].sendIndices.begin(), 
@@ -1799,7 +1868,11 @@ uint GridImp::getSize() const
 
 uint GridImp::getSparseSize() const
 {
-    return this->sparseSize;
+    return this->sparseSize; 
+}
+
+uint GridImp::getNumberOfFluidNodes() const
+{
+    return (uint)this->fluidNodeIndices.size();
 
 Field GridImp::getField() const
@@ -1942,6 +2015,12 @@ void GridImp::getGridInterface(uint* gridInterfaceList, const uint* oldGridInter
         gridInterfaceList[i] = oldGridInterfaceList[i] + 1; // + 1 for numbering shift between GridGenerator and VF_GPU
 }
 
+bool GridImp::isSparseIndexInFluidNodeIndicesBorder(uint &sparseIndex) const
+{
+    return std::find(this->fluidNodeIndicesBorder.begin(), this->fluidNodeIndicesBorder.end(), sparseIndex) !=
+           this->fluidNodeIndicesBorder.end();
+}
+
 #define GEOFLUID 19
 #define GEOSOLID 16
 
@@ -1956,7 +2035,7 @@ void GridImp::getNodeValues(real *xCoords, real *yCoords, real *zCoords, uint *n
     geo[0] = GEOSOLID;
 
     int nodeNumber = 0;
-    for (uint i = 0; i < this->getSize(); i++)
+    for (uint i = 0; i < this->size; i++)
     {
         if (this->sparseIndices[i] == -1)
             continue;
@@ -1986,10 +2065,27 @@ void GridImp::getNodeValues(real *xCoords, real *yCoords, real *zCoords, uint *n
     }
 }
 
+void GridImp::getFluidNodeIndices(uint *fluidNodeIndices) const
+{
+    for (uint nodeNumber = 0; nodeNumber < (uint)this->fluidNodeIndices.size(); nodeNumber++)
+        fluidNodeIndices[nodeNumber] = this->fluidNodeIndices[nodeNumber];
+}
+
+uint GridImp::getNumberOfFluidNodesBorder() const
+{
+    return (uint)this->fluidNodeIndicesBorder.size();
+}
+
+void GridImp::getFluidNodeIndicesBorder(uint *fluidNodeIndicesBorder) const
+{
+    for (uint nodeNumber = 0; nodeNumber < (uint)this->fluidNodeIndicesBorder.size(); nodeNumber++)
+        fluidNodeIndicesBorder[nodeNumber] = this->fluidNodeIndicesBorder[nodeNumber];
+}
+
 void GridImp::print() const
 {
     printf("min: (%2.4f, %2.4f, %2.4f), max: (%2.4f, %2.4f, %2.4f), size: %d, delta: %2.4f\n", startX, startY, startZ,
            endX, endY, endZ, size, delta);
     if(this->gridInterface)
         this->gridInterface->print();
-}
\ No newline at end of file
+}
diff --git a/src/gpu/GridGenerator/grid/GridImp.h b/src/gpu/GridGenerator/grid/GridImp.h
index 9a8c209d2e6d3113c44482473ef591e0af4cb44d..ee30e2b4aaadd737e1fa096eec3b815768ddd0a0 100644
--- a/src/gpu/GridGenerator/grid/GridImp.h
+++ b/src/gpu/GridGenerator/grid/GridImp.h
@@ -37,12 +37,12 @@
 
 #include "Core/LbmOrGks.h"
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
-#include "grid/distributions/Distribution.h"
-#include "grid/Grid.h"
-#include "grid/Cell.h"
-#include "grid/Field.h" 
+#include "gpu/GridGenerator/grid/distributions/Distribution.h"
+#include "gpu/GridGenerator/grid/Grid.h"
+#include "gpu/GridGenerator/grid/Cell.h"
+#include "gpu/GridGenerator/grid/Field.h" 
 
 class TriangularMesh;
 struct Vertex;
@@ -70,7 +70,7 @@ extern int DIRECTIONS[DIR_END_MAX][DIMENSION];
 
 class GRIDGENERATOR_EXPORT GridImp : public enableSharedFromThis<GridImp>, public Grid
 {
-private:
+protected:
     GridImp() = default;
     GridImp(Object* object, real startX, real startY, real startZ, real endX, real endY, real endZ, real delta, Distribution d, uint level);
 
@@ -116,6 +116,9 @@ private:
     int *neighborIndexX, *neighborIndexY, *neighborIndexZ, *neighborIndexNegative;
     int *sparseIndices;
 
+    std::vector<uint> fluidNodeIndices;
+    std::vector<uint> fluidNodeIndicesBorder;
+
 	uint *qIndices;     //maps from matrix index to qIndex
 	real *qValues;
     uint *qPatches;
@@ -249,6 +252,8 @@ public:
 
     static void getGridInterface(uint *gridInterfaceList, const uint *oldGridInterfaceList, uint size);
 
+    bool isSparseIndexInFluidNodeIndicesBorder(uint &sparseIndex) const override;
+
     int *getNeighborsX() const override;
     int* getNeighborsY() const override;
     int* getNeighborsZ() const override;
@@ -341,7 +346,20 @@ public:
     uint getSendIndex(int direction, uint index) override;
     uint getReceiveIndex(int direction, uint index) override;
 
-    void repairCommunicationInices(int direction) override;
+    bool isSendNode(int index) const override;
+    bool isReceiveNode(int index) const override;
+
+    void repairCommunicationIndices(int direction) override;
+
+    void findFluidNodeIndices(bool splitDomain) override;
+    void findFluidNodeIndicesBorder() override;
+
+    uint getNumberOfFluidNodes() const override;
+    void getFluidNodeIndices(uint *fluidNodeIndices) const override;
+
+    uint getNumberOfFluidNodesBorder() const override;
+    void getFluidNodeIndicesBorder(uint *fluidNodeIndicesBorder) const override;
+
 
 public:
     struct CommunicationIndices {
diff --git a/src/gpu/GridGenerator/grid/GridInterface.h b/src/gpu/GridGenerator/grid/GridInterface.h
index 303d79d4995ea04fe30b2a004c5738bf9c926cf2..b5f71317e7755a8b6bcfe3da084e0fc9155642f8 100644
--- a/src/gpu/GridGenerator/grid/GridInterface.h
+++ b/src/gpu/GridGenerator/grid/GridInterface.h
@@ -33,7 +33,7 @@
 #ifndef GRID_INTERFACE_H
 #define GRID_INTERFACE_H
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
 class GridImp;
 
diff --git a/src/gpu/GridGenerator/grid/distributions/Distribution.h b/src/gpu/GridGenerator/grid/distributions/Distribution.h
index abc6d8105c9daeb83f4e31478a55942f48bf5e65..7982abc235b020003526b10c885f4936bf661936 100644
--- a/src/gpu/GridGenerator/grid/distributions/Distribution.h
+++ b/src/gpu/GridGenerator/grid/distributions/Distribution.h
@@ -36,7 +36,7 @@
 #include <vector>
 #include <string>
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
 #define DIR_END_MAX 27
 
diff --git a/src/gpu/GridGenerator/io/GridVTKWriter/GridVTKWriter.cpp b/src/gpu/GridGenerator/io/GridVTKWriter/GridVTKWriter.cpp
index 2c8624732d70e52ac24d843888548bdaf5585686..35b3197ff7c3f37eb33809cc9a909f0085d2dffc 100644
--- a/src/gpu/GridGenerator/io/GridVTKWriter/GridVTKWriter.cpp
+++ b/src/gpu/GridGenerator/io/GridVTKWriter/GridVTKWriter.cpp
@@ -90,6 +90,8 @@ void GridVTKWriter::writeGridToVTKXML(SPtr<Grid> grid, const std::string& name,
         nodedatanames.push_back("types");
         nodedatanames.push_back("sparse_id");
         nodedatanames.push_back("matrix_id");
+        nodedatanames.push_back("isSendNode");
+        nodedatanames.push_back("isReceiveNode");
 
         nodedata.resize(nodedatanames.size());
 
@@ -117,6 +119,8 @@ void GridVTKWriter::writeGridToVTKXML(SPtr<Grid> grid, const std::string& name,
                     nodedata[0].push_back(type);
                     nodedata[1].push_back(grid->getSparseIndex(index));
                     nodedata[2].push_back(index);
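+                    // flag send/receive nodes so the communication layout can be checked in the VTK output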
+                    nodedata[3].push_back(grid->isSendNode(index));
+                    nodedata[4].push_back(grid->isReceiveNode(index));
                 }
             }
         }
diff --git a/src/gpu/GridGenerator/io/STLReaderWriter/STLReader.cpp b/src/gpu/GridGenerator/io/STLReaderWriter/STLReader.cpp
index 173f79c184c0a455ffd5b27cae59e07fa6dd4fa6..d3eb221265b2f8c79d5aece8729585733c2d60e8 100644
--- a/src/gpu/GridGenerator/io/STLReaderWriter/STLReader.cpp
+++ b/src/gpu/GridGenerator/io/STLReaderWriter/STLReader.cpp
@@ -118,7 +118,7 @@ std::vector<Triangle> STLReader::readASCIISTLWithPatches(const std::string& name
     std::ifstream file;
     file.open(name.c_str(), std::ifstream::in);
 
-    if( !file.is_open() ) throw std::runtime_error(name + "cannot be opened!");
+    if( !file.is_open() ) throw std::runtime_error(name + " cannot be opened!");
 
     uint currentPatchIndex = 0;
 
diff --git a/src/gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h b/src/gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h
index d5d2a377b33697704b86f8b78987fd0af75be415..4a4552f74b69949865e233014d74ac7168b36b31 100644
--- a/src/gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h
+++ b/src/gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h
@@ -42,7 +42,7 @@
 
 #include "Core/NonCreatable.h"
 
-#include "global.h"
+#include "gpu/GridGenerator/global.h"
 
 class UnstructuredGridBuilder;
 class GridBuilder;
diff --git a/src/gpu/VirtualFluids_GPU/CMakeLists.txt b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
index 3e40d54b4c4b4f188e6efb6a0b5bb3d2ed6bc3a2..53707fa381228e4d2ca380e3ba16f5bf0e5f2d38 100644
--- a/src/gpu/VirtualFluids_GPU/CMakeLists.txt
+++ b/src/gpu/VirtualFluids_GPU/CMakeLists.txt
@@ -19,4 +19,7 @@ vf_add_tests()
 if(BUILD_VF_UNIT_TESTS)
     set_target_properties(VirtualFluids_GPUTests PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
     set_source_files_properties(Kernel/Utilities/DistributionHelperTests.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp PROPERTIES LANGUAGE CUDA)
+    set_source_files_properties(Communication/ExchangeData27Test.cpp PROPERTIES LANGUAGE CUDA)
+    target_include_directories(VirtualFluids_GPUTests PRIVATE "${VF_THIRD_DIR}/cuda_samples/")
 endif()
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.cpp b/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8082a554963f01637295674d846397ceef34eeb2
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.cpp
@@ -0,0 +1,182 @@
+//  _    ___      __              __________      _     __        ______________   __
+// | |  / (_)____/ /___  ______ _/ / ____/ /_  __(_)___/ /____   /  ___/ __  / /  / /
+// | | / / / ___/ __/ / / / __ `/ / /_  / / / / / / __  / ___/  / /___/ /_/ / /  / /
+// | |/ / / /  / /_/ /_/ / /_/ / / __/ / / /_/ / / /_/ (__  )  / /_) / ____/ /__/ / 
+// |___/_/_/   \__/\__,_/\__,_/_/_/   /_/\__,_/_/\__,_/____/   \____/_/    \_____/
+//
+//////////////////////////////////////////////////////////////////////////
+#include "Calculation/CalcTurbulenceIntensity.h"
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#include <basics/Core/StringUtilities/StringUtil.h>
+
+void allocTurbulenceIntensity(Parameter *para, CudaMemoryManager *cudaManager)
+{
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        cudaManager->cudaAllocTurbulenceIntensity(lev, para->getParH(lev)->size_Mat_SP);
+        para->getParH(lev)->turbulenceIntensity.resize(para->getParH(lev)->size_Mat_SP);
+    }
+    resetVelocityFluctuationsAndMeans(para, cudaManager);
+}
+
+
+void calcVelocityAndFluctuations(Parameter *para, CudaMemoryManager *cudaManager, uint tdiff)
+{
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        cudaManager->cudaCopyTurbulenceIntensityDH(lev, para->getParH(lev)->size_Mat_SP);
+
+        for (uint i = 0; i < para->getParH(lev)->size_Mat_SP; i++) {
+            // mean velocity
+            para->getParH(lev)->vx_mean[i] = para->getParH(lev)->vx_mean[i] / (real)tdiff;
+            para->getParH(lev)->vy_mean[i] = para->getParH(lev)->vy_mean[i] / (real)tdiff;
+            para->getParH(lev)->vz_mean[i] = para->getParH(lev)->vz_mean[i] / (real)tdiff;
+
+            // fluctuations
+            para->getParH(lev)->vxx[i] = para->getParH(lev)->vxx[i] / (real)tdiff;
+            para->getParH(lev)->vyy[i] = para->getParH(lev)->vyy[i] / (real)tdiff;
+            para->getParH(lev)->vzz[i] = para->getParH(lev)->vzz[i] / (real)tdiff;
+            para->getParH(lev)->vxy[i] = para->getParH(lev)->vxy[i] / (real)tdiff;
+            para->getParH(lev)->vxz[i] = para->getParH(lev)->vxz[i] / (real)tdiff;
+            para->getParH(lev)->vyz[i] = para->getParH(lev)->vyz[i] / (real)tdiff;
+
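+            // second moments -> (co)variances: Var(v) = E[v*v] - E[v]*E[v], analogously for the cross terms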
+            para->getParH(lev)->vxx[i] =
+                para->getParH(lev)->vxx[i] - para->getParH(lev)->vx_mean[i] * para->getParH(lev)->vx_mean[i];
+            para->getParH(lev)->vyy[i] =
+                para->getParH(lev)->vyy[i] - para->getParH(lev)->vy_mean[i] * para->getParH(lev)->vy_mean[i];
+            para->getParH(lev)->vzz[i] =
+                para->getParH(lev)->vzz[i] - para->getParH(lev)->vz_mean[i] * para->getParH(lev)->vz_mean[i];
+            para->getParH(lev)->vxy[i] =
+                para->getParH(lev)->vxy[i] - para->getParH(lev)->vx_mean[i] * para->getParH(lev)->vy_mean[i];
+            para->getParH(lev)->vxz[i] =
+                para->getParH(lev)->vxz[i] - para->getParH(lev)->vx_mean[i] * para->getParH(lev)->vz_mean[i];
+            para->getParH(lev)->vyz[i] =
+                para->getParH(lev)->vyz[i] - para->getParH(lev)->vy_mean[i] * para->getParH(lev)->vz_mean[i];
+        }
+    }
+}
+
+
+void calcTurbulenceIntensity(Parameter *para, CudaMemoryManager *cudaManager, uint tdiff) {
+    real fluc_squared;
+    real v_mean_squared;
+
+    // calcVelocityAndFluctuations loops over all levels itself, so call it once, outside the level loop
+    calcVelocityAndFluctuations(para, cudaManager, tdiff);
+
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        for (uint i = 0; i < para->getParH(lev)->size_Mat_SP; i++) {
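+            // turbulence intensity: TI = sqrt( (vxx + vyy + vzz) / 3 ) / |v_mean|
+            // note: a node with zero mean velocity yields v_mean_squared == 0 and thus a NaN TI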
+            fluc_squared = (real)(
+                1.0 / 3.0 * (para->getParH(lev)->vxx[i] + para->getParH(lev)->vyy[i] + para->getParH(lev)->vzz[i]));
+            v_mean_squared = para->getParH(lev)->vx_mean[i] * para->getParH(lev)->vx_mean[i] +
+                             para->getParH(lev)->vy_mean[i] * para->getParH(lev)->vy_mean[i] +
+                             para->getParH(lev)->vz_mean[i] * para->getParH(lev)->vz_mean[i];
+            para->getParH(lev)->turbulenceIntensity[i] = (real)sqrt(fluc_squared / v_mean_squared);
+        }
+    }
+}
+
+
+void resetVelocityFluctuationsAndMeans(Parameter *para, CudaMemoryManager *cudaManager)
+{
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        for (unsigned int i = 0; i < para->getParH(lev)->size_Mat_SP; i++) {
+            para->getParH(lev)->vxx[i]     = (real)0.0;
+            para->getParH(lev)->vyy[i]     = (real)0.0;
+            para->getParH(lev)->vzz[i]     = (real)0.0;
+            para->getParH(lev)->vxy[i]     = (real)0.0;
+            para->getParH(lev)->vxz[i]     = (real)0.0;
+            para->getParH(lev)->vyz[i]     = (real)0.0;
+            para->getParH(lev)->vx_mean[i] = (real)0.0;
+            para->getParH(lev)->vy_mean[i] = (real)0.0;
+            para->getParH(lev)->vz_mean[i] = (real)0.0;
+        }
+
+        cudaManager->cudaCopyTurbulenceIntensityHD(lev, para->getParH(lev)->size_Mat_SP);
+    }
+}
+
+void cudaFreeTurbulenceIntensityArrays(Parameter *para, CudaMemoryManager *cudaManager)
+{
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        cudaManager->cudaFreeTurbulenceIntensity(lev);
+    }
+}
+
+void writeTurbulenceIntensityToFile(Parameter *para, uint timestep)
+{
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        std::vector<real *> data           = { para->getParH(lev)->turbulenceIntensity.data() };
+        std::vector<std::string> datanames = { "ti" };
+        writeTiStuffToFile(para, timestep, para->getParH(lev)->size_Mat_SP, data, datanames);
+    }
+}
+
+void writeVeloFluctuationToFile(Parameter *para, uint timestep) 
+{
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        std::vector<real *> data = { para->getParH(lev)->vxx, para->getParH(lev)->vyy, para->getParH(lev)->vzz };
+        std::vector<std::string> datanames = { "vxx", "vyy", "vzz" };
+        writeTiStuffToFile(para, timestep, para->getParH(lev)->size_Mat_SP, data, datanames);
+    }
+}
+
+void writeVeloMeansToFile(Parameter *para, uint timestep) {
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        std::vector<real *> data           = { para->getParH(lev)->vx_mean, 
+                                               para->getParH(lev)->vy_mean,
+                                               para->getParH(lev)->vz_mean };
+        std::vector<std::string> datanames = { "vx_mean", "vy_mean", "vz_mean" };
+        writeTiStuffToFile(para, timestep, para->getParH(lev)->size_Mat_SP, data, datanames);
+    }
+}
+
+void writeAllTiDatafToFile(Parameter *para, uint timestep)
+{
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+        std::vector<real *> data = { para->getParH(lev)->vxx,
+                                     para->getParH(lev)->vyy,
+                                     para->getParH(lev)->vzz,
+                                     para->getParH(lev)->vx_mean,
+                                     para->getParH(lev)->vy_mean,
+                                     para->getParH(lev)->vz_mean,
+                                     para->getParH(lev)->turbulenceIntensity.data() };
+        std::vector<std::string> datanames = { "vxx", "vyy", "vzz", "vx_mean", "vy_mean", "vz_mean", "ti" };
+        writeTiStuffToFile(para, timestep, para->getParH(lev)->size_Mat_SP, data, datanames);
+    }
+}
+
+void writeTiStuffToFile(Parameter *para, uint timestep, int sizeOfTiArray, std::vector<real *> &data,
+                        std::vector<std::string> &datanames)
+{
+    ////////////////////////////////////////////////////////////////////////
+    // set filename
+    std::string names;
+    std::for_each(datanames.begin(), datanames.end(), [&names](const std::string &s) { names += "_" + s; });
+    std::string ffname = para->getFName() + StringUtil::toString<int>(para->getMyID()) + "_" +
+                         StringUtil::toString<int>(timestep) + names + "_ti.txt";
+    const char *fname = ffname.c_str();
+    ////////////////////////////////////////////////////////////////////////
+    // set ofstream
+    std::ofstream ostr;
+    ////////////////////////////////////////////////////////////////////////
+    // open file
+    ostr.open(fname);
+    ////////////////////////////////////////////////////////////////////////
+    // add header
+    ostr << "index_sp";
+    for (auto name : datanames) ostr << "\t" << name;
+    ostr << std::endl;
+    ////////////////////////////////////////////////////////////////////////
+    // fill file with data
+    for (int i = 0; i < sizeOfTiArray; i++) {
+        ostr << i;
+        for (auto dataset : data)
+            ostr << "\t" << dataset[i];
+        ostr << std::endl;
+    }
+    ////////////////////////////////////////////////////////////////////////
+    // close file
+    ostr.close();
+    ////////////////////////////////////////////////////////////////////////
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.h b/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a2d539f3ae31f3975d03cbc0ea73dad90c20f73
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Calculation/CalcTurbulenceIntensity.h
@@ -0,0 +1,24 @@
+#ifndef CalcTurbulenceIntensity_H
+#define CalcTurbulenceIntensity_H
+
+#include "LBM/LB.h"
+#include "GPU/GPU_Interface.h"
+#include "Parameter/Parameter.h"
+#include "GPU/CudaMemoryManager.h"
+
+extern "C" void allocTurbulenceIntensity(Parameter *para, CudaMemoryManager *cudaManager);
+extern "C" void calcVelocityAndFluctuations(Parameter *para, CudaMemoryManager *cudaManager, uint tdiff);
+extern "C" void calcTurbulenceIntensity(Parameter *para, CudaMemoryManager *cudaManager, uint tdiff);
+extern "C" void resetVelocityFluctuationsAndMeans(Parameter *para, CudaMemoryManager *cudaManager);
+extern "C" void cudaFreeTurbulenceIntensityArrays(Parameter *para, CudaMemoryManager *cudaManager);
+
+
+void writeTurbulenceIntensityToFile(Parameter *para, uint timestep);
+void writeVeloFluctuationToFile(Parameter *para, uint timestep);
+void writeVeloMeansToFile(Parameter *para, uint timestep);
+void writeAllTiDatafToFile(Parameter *para, uint timestep);
+
+void writeTiStuffToFile(Parameter *para, uint timestep, int sizeOfTiArray, std::vector<real *> &data,
+                  std::vector<std::string> &datanames);
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
index 17d01e57e4c34894e0e0551dd7443dfe92582240..0cae176e48042b112480a8a718b4060a2e7396ca 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
@@ -7,67 +7,179 @@
 //#include "Output/UnstructuredGridWriter.hpp"
 #include "Communication/ExchangeData27.h"
 #include "Kernel/Kernel.h"
+#include "Parameter/CudaStreamManager.h"
 #include "GPU/TurbulentViscosity.h"
 
-void updateGrid27(Parameter* para, 
-                  vf::gpu::Communicator& comm, 
-                  CudaMemoryManager* cudaManager, 
-                  std::vector<std::shared_ptr<PorousMedia>>& pm, 
-                  int level, 
-                  unsigned int t, 
-                  std::vector < SPtr< Kernel>>& kernels)
+void UpdateGrid27::updateGrid(int level, unsigned int t)
 {
     //////////////////////////////////////////////////////////////////////////
-    
-    if( level != para->getFine() )
-    {
-        updateGrid27(para, comm, cudaManager, pm, level+1, t, kernels);
-        updateGrid27(para, comm, cudaManager, pm, level+1, t, kernels);
+
+    if (level != para->getFine()) {
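+        // nested time stepping: the finer level uses half the time-step size and is therefore advanced twice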
+        updateGrid(level + 1, t);
+        updateGrid(level + 1, t);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    
-    collision(para, pm, level, t, kernels);
-    
-    //////////////////////////////////////////////////////////////////////////
-    
-    exchangeMultiGPU(para, comm, cudaManager, level);
-    
+
+    (this->*collisionAndExchange)(level, t);
+
     //////////////////////////////////////////////////////////////////////////
-    
-    postCollisionBC(para, level, t);
-    
+
+    postCollisionBC(para.get(), level, t);
+
     //////////////////////////////////////////////////////////////////////////
 
-    swapBetweenEvenAndOddTimestep(para, level);
+    swapBetweenEvenAndOddTimestep(para.get(), level);
 
 	//////////////////////////////////////////////////////////////////////////
-    
-    if (para->getUseWale())
-		calcMacroscopicQuantities(para, level);
+
+	if (para->getUseWale())
+		calcMacroscopicQuantities(para.get(), level);
 
     if (para->getUseTurbulentViscosity())
-        calcTurbulentViscosity(para, level);
-    
-    //////////////////////////////////////////////////////////////////////////
-    
-    preCollisionBC(para, cudaManager, level, t);
-    
+        calcTurbulentViscosity(para.get(), level);
+
+	//////////////////////////////////////////////////////////////////////////
+
+    preCollisionBC(para.get(), cudaManager.get(), level, t);
+
     //////////////////////////////////////////////////////////////////////////
-    
     if( level != para->getFine() )
     {
-        fineToCoarse(para, level);
-
-        exchangeMultiGPU(para, comm, cudaManager, level);
-
-        coarseToFine(para, level);
+        (this->*refinementAndExchange)(level);
     }
+        
+    interactWithActuators(para.get(), cudaManager.get(), level, t);
     
-    interactWithActuators(para, cudaManager, level, t);
-    
-    interactWithProbes(para, cudaManager, level, t);
-    //////////////////////////////////////////////////////////////////////////
+    interactWithProbes(para.get(), cudaManager.get(), level, t);
+}
+
+void UpdateGrid27::refinementAndExchange_noRefinementAndExchange(int level) {}
+
+void UpdateGrid27::refinementAndExchange_streams_onlyExchangeInterface(int level)
+{
+    int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex();
+    int bulkStreamIndex   = para->getStreamManager()->getBulkStreamIndex();
+
+    // fine to coarse border
+    fineToCoarseWithStream(para.get(), level, para->getParD(level)->intFCBorder.ICellFCC,
+                           para->getParD(level)->intFCBorder.ICellFCF, para->getParD(level)->intFCBorder.kFC,
+                           borderStreamIndex);
+
+    // prepare exchange and trigger bulk kernel when finished
+    prepareExchangeMultiGPUAfterFtoC(para.get(), level, borderStreamIndex);
+    if (para->getUseStreams())
+        para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex);
+
+    // launch bulk kernels (f to c and c to f)
+    para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex);
+    fineToCoarseWithStream(para.get(), level, para->getParD(level)->intFCBulk.ICellFCC,
+                           para->getParD(level)->intFCBulk.ICellFCF, para->getParD(level)->intFCBulk.kFC,
+                           bulkStreamIndex);
+    coarseToFineWithStream(para.get(), level, para->getParD(level)->intCFBulk.ICellCFC,
+                           para->getParD(level)->intCFBulk.ICellCFF, para->getParD(level)->intCFBulk.kCF, para->getParD(level)->offCFBulk,
+                           bulkStreamIndex);
+
+    // exchange
+    exchangeMultiGPUAfterFtoC(para.get(), comm, cudaManager.get(), level, borderStreamIndex);
+
+    // coarse to fine border
+    coarseToFineWithStream(para.get(), level, para->getParD(level)->intCFBorder.ICellCFC,
+                           para->getParD(level)->intCFBorder.ICellCFF, para->getParD(level)->intCFBorder.kCF, para->getParD(level)->offCF,
+                           borderStreamIndex);
+    cudaDeviceSynchronize(); 
+}
+
+void UpdateGrid27::refinementAndExchange_streams_completeExchange(int level)
+{
+    int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex();
+    int bulkStreamIndex   = para->getStreamManager()->getBulkStreamIndex();
+
+    // fine to coarse border
+    fineToCoarseWithStream(para.get(), level, para->getParD(level)->intFCBorder.ICellFCC,
+                           para->getParD(level)->intFCBorder.ICellFCF, para->getParD(level)->intFCBorder.kFC,
+                           borderStreamIndex);
+
+    // prepare exchange and trigger bulk kernel when finished
+    prepareExchangeMultiGPU(para.get(), level, borderStreamIndex);
+    if (para->getUseStreams())
+        para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex);
+
+    // launch bulk kernels (f to c and c to f)
+    para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex);
+    fineToCoarseWithStream(para.get(), level, para->getParD(level)->intFCBulk.ICellFCC,
+                           para->getParD(level)->intFCBulk.ICellFCF, para->getParD(level)->intFCBulk.kFC,
+                           bulkStreamIndex);
+    coarseToFineWithStream(para.get(), level, para->getParD(level)->intCFBulk.ICellCFC,
+                           para->getParD(level)->intCFBulk.ICellCFF, para->getParD(level)->intCFBulk.kCF, para->getParD(level)->offCFBulk,
+                           bulkStreamIndex);
+
+    // exchange
+    exchangeMultiGPU(para.get(), comm, cudaManager.get(), level, borderStreamIndex);
+
+    // coarse to fine border
+    coarseToFineWithStream(para.get(), level, para->getParD(level)->intCFBorder.ICellCFC,
+                           para->getParD(level)->intCFBorder.ICellCFF, para->getParD(level)->intCFBorder.kCF, para->getParD(level)->offCF,
+                           borderStreamIndex);
+    cudaDeviceSynchronize(); 
+}
+
+void UpdateGrid27::refinementAndExchange_noStreams_onlyExchangeInterface(int level)
+{
+    fineToCoarse(para.get(), level);
+
+    exchangeMultiGPU_noStreams_withPrepare(para.get(), comm, cudaManager.get(), level, true);
+
+    coarseToFine(para.get(), level);
+}
+
+void UpdateGrid27::refinementAndExchange_noStreams_completeExchange(int level)
+{
+    fineToCoarse(para.get(), level);
+
+    exchangeMultiGPU_noStreams_withPrepare(para.get(), comm, cudaManager.get(), level, false);
+
+    coarseToFine(para.get(), level);
+}
+
+void UpdateGrid27::refinementAndExchange_noExchange(int level)
+{
+    fineToCoarse(para.get(), level);
+    coarseToFine(para.get(), level);
+}
+
+void UpdateGrid27::collisionAndExchange_noStreams_indexKernel(int level, unsigned int t)
+{
+    collisionUsingIndex(para.get(), pm, level, t, kernels, para->getParD(level)->fluidNodeIndices,
+                            para->getParD(level)->numberOfFluidNodes, -1);
+    exchangeMultiGPU_noStreams_withPrepare(para.get(), comm, cudaManager.get(), level, false);
+}
+
+void UpdateGrid27::collisionAndExchange_noStreams_oldKernel(int level, unsigned int t)
+{
+    collision(para.get(), pm, level, t, kernels);
+    exchangeMultiGPU_noStreams_withPrepare(para.get(), comm, cudaManager.get(), level, false);
+}
+
+void UpdateGrid27::collisionAndExchange_streams(int level, unsigned int t)
+{
+    int borderStreamIndex = para->getStreamManager()->getBorderStreamIndex();
+    int bulkStreamIndex   = para->getStreamManager()->getBulkStreamIndex();
+    // launch border kernel
+    collisionUsingIndex(para.get(), pm, level, t, kernels, para->getParD(level)->fluidNodeIndicesBorder,
+                        para->getParD(level)->numberOffluidNodesBorder, borderStreamIndex);
+
+    // prepare exchange and trigger bulk kernel when finished
+    prepareExchangeMultiGPU(para.get(), level, borderStreamIndex);
+    if (para->getUseStreams())
+        para->getStreamManager()->triggerStartBulkKernel(borderStreamIndex);
+
+    // launch bulk kernel
+    para->getStreamManager()->waitOnStartBulkKernelEvent(bulkStreamIndex);
+    collisionUsingIndex(para.get(), pm, level, t, kernels, para->getParD(level)->fluidNodeIndices,
+                        para->getParD(level)->numberOfFluidNodes, bulkStreamIndex);
+
+    exchangeMultiGPU(para.get(), comm, cudaManager.get(), level, borderStreamIndex);
 }
 
 void collision(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level, unsigned int t, std::vector < SPtr< Kernel>>& kernels)
@@ -85,6 +197,26 @@ void collision(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, i
         collisionAdvectionDiffusion(para, level);
 }
 
+void collisionUsingIndex(Parameter *para, std::vector<std::shared_ptr<PorousMedia>> &pm, int level, unsigned int t,
+                         std::vector<SPtr<Kernel>> &kernels, uint *fluidNodeIndices, uint numberOfFluidNodes, int stream)
+{
+    if (fluidNodeIndices != nullptr && numberOfFluidNodes != 0)
+        kernels.at(level)->runOnIndices(fluidNodeIndices, numberOfFluidNodes, stream);
+    else
+        std::cout << "In collision: fluidNodeIndices or numberOfFluidNodes not defined" << std::endl;
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if (para->getSimulatePorousMedia())
+        collisionPorousMedia(para, pm, level);
+
+    //////////////////////////////////////////////////////////////////////////
+
+    if (para->getDiffOn())
+        collisionAdvectionDiffusion(para, level);
+}
+
 void collisionPorousMedia(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level)
 {
     for( std::size_t i = 0; i < pm.size(); i++ )
@@ -158,35 +290,115 @@ void collisionAdvectionDiffusion(Parameter* para, int level)
 	}
 }
 
-void exchangeMultiGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
+void prepareExchangeMultiGPU(Parameter *para, int level, int streamIndex)
 {
-    if (para->getNumprocs() > 1)
-	{
-        // St. Lenz: exchange for post-collision data and pre-collision data are identical!
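+    // prepare the exchange in all three coordinate directions (gather the send-node data before the actual communication)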
+    prepareExchangeCollDataXGPU27AllNodes(para, level, streamIndex);
+    prepareExchangeCollDataYGPU27AllNodes(para, level, streamIndex);
+    prepareExchangeCollDataZGPU27AllNodes(para, level, streamIndex);
+}
 
-		//////////////////////////////////////////////////////////////////////////
-		//3D domain decomposition
-		exchangePostCollDataXGPU27(para, comm, cudaManager, level);
-		exchangePostCollDataYGPU27(para, comm, cudaManager, level);
-		exchangePostCollDataZGPU27(para, comm, cudaManager, level);
+void prepareExchangeMultiGPUAfterFtoC(Parameter *para, int level, int streamIndex)
+{
+    prepareExchangeCollDataXGPU27AfterFtoC(para, level, streamIndex);
+    prepareExchangeCollDataYGPU27AfterFtoC(para, level, streamIndex);
+    prepareExchangeCollDataZGPU27AfterFtoC(para, level, streamIndex);
+}
 
-		//////////////////////////////////////////////////////////////////////////
-		//3D domain decomposition convection diffusion
-		if (para->getDiffOn()==true)
-		{
-			exchangePostCollDataADXGPU27(para, comm, cudaManager, level);
-			exchangePostCollDataADYGPU27(para, comm, cudaManager, level);
-			exchangePostCollDataADZGPU27(para, comm, cudaManager, level);
-		}
+void exchangeMultiGPU(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager, int level,
+                      int streamIndex)
+{
+    //////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition
+    exchangeCollDataXGPU27AllNodes(para, comm, cudaManager, level, streamIndex);
+    exchangeCollDataYGPU27AllNodes(para, comm, cudaManager, level, streamIndex);
+    exchangeCollDataZGPU27AllNodes(para, comm, cudaManager, level, streamIndex);
 
-        //////////////////////////////////////////////////////////////////////////
-        // D E P R E C A T E D
-        //////////////////////////////////////////////////////////////////////////
-		
-		//////////////////////////////////////////////////////////////////////////
-		//1D domain decomposition
-		//exchangePostCollDataGPU27(para, comm, level);
-	}
+    scatterNodesFromRecvBufferXGPU27AllNodes(para, level, streamIndex);
+    scatterNodesFromRecvBufferYGPU27AllNodes(para, level, streamIndex);
+    scatterNodesFromRecvBufferZGPU27AllNodes(para, level, streamIndex);
+
+    //////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition convection diffusion
+    if (para->getDiffOn()) {
+        if (para->getUseStreams())
+            std::cout << "Warning: Cuda streams not yet implemented for convection diffusion" << std::endl;
+        exchangePostCollDataADXGPU27(para, comm, cudaManager, level);
+        exchangePostCollDataADYGPU27(para, comm, cudaManager, level);
+        exchangePostCollDataADZGPU27(para, comm, cudaManager, level);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // D E P R E C A T E D
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    // 1D domain decomposition
+    // exchangePostCollDataGPU27(para, comm, level);
+}
+void exchangeMultiGPU_noStreams_withPrepare(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager, int level, bool useReducedComm)
+{
+    //////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition
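+    // useReducedComm: after a fine-to-coarse step only the interface-affected nodes have changed, so the
+    // reduced "AfterFtoC" exchange is sufficient; streamIndex -1 marks the path without CUDA streams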
+    if (useReducedComm) {
+        // X
+        prepareExchangeCollDataXGPU27AfterFtoC(para, level, -1);
+        exchangeCollDataXGPU27AfterFtoC(para, comm, cudaManager, level, -1);
+        scatterNodesFromRecvBufferXGPU27AfterFtoC(para, level, -1);
+        // Y
+        prepareExchangeCollDataYGPU27AfterFtoC(para, level, -1);
+        exchangeCollDataYGPU27AfterFtoC(para, comm, cudaManager, level, -1);
+        scatterNodesFromRecvBufferYGPU27AfterFtoC(para, level, -1);
+        // Z
+        prepareExchangeCollDataZGPU27AfterFtoC(para, level, -1);
+        exchangeCollDataZGPU27AfterFtoC(para, comm, cudaManager, level, -1);
+        scatterNodesFromRecvBufferZGPU27AfterFtoC(para, level, -1);  
+    } else {
+        // X
+        prepareExchangeCollDataXGPU27AllNodes(para, level, -1);
+        exchangeCollDataXGPU27AllNodes(para, comm, cudaManager, level, -1);
+        scatterNodesFromRecvBufferXGPU27AllNodes(para, level, -1);
+        // Y
+        prepareExchangeCollDataYGPU27AllNodes(para, level, -1);
+        exchangeCollDataYGPU27AllNodes(para, comm, cudaManager, level, -1);
+        scatterNodesFromRecvBufferYGPU27AllNodes(para, level, -1);
+        // Z
+        prepareExchangeCollDataZGPU27AllNodes(para, level, -1);
+        exchangeCollDataZGPU27AllNodes(para, comm, cudaManager, level, -1);
+        scatterNodesFromRecvBufferZGPU27AllNodes(para, level, -1);   
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition convection diffusion
+    if (para->getDiffOn()) {
+        if (para->getUseStreams())
+            std::cout << "Warning: Cuda streams not yet implemented for convection diffusion" << std::endl;
+        exchangePostCollDataADXGPU27(para, comm, cudaManager, level);
+        exchangePostCollDataADYGPU27(para, comm, cudaManager, level);
+        exchangePostCollDataADZGPU27(para, comm, cudaManager, level);
+    }
+}
+void exchangeMultiGPUAfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager, int level,
+                               int streamIndex)
+{
+    //////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition
+    exchangeCollDataXGPU27AfterFtoC(para, comm, cudaManager, level, streamIndex);
+    exchangeCollDataYGPU27AfterFtoC(para, comm, cudaManager, level, streamIndex);
+    exchangeCollDataZGPU27AfterFtoC(para, comm, cudaManager, level, streamIndex);
+
+    scatterNodesFromRecvBufferXGPU27AfterFtoC(para, level, streamIndex);
+    scatterNodesFromRecvBufferYGPU27AfterFtoC(para, level, streamIndex);
+    scatterNodesFromRecvBufferZGPU27AfterFtoC(para, level, streamIndex);
+
+    //////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition convection diffusion
+    if (para->getDiffOn()) {
+        if (para->getUseStreams())
+            std::cout << "Warning: Cuda streams not yet implemented for convection diffusion" << std::endl;
+        exchangePostCollDataADXGPU27(para, comm, cudaManager, level);
+        exchangePostCollDataADYGPU27(para, comm, cudaManager, level);
+        exchangePostCollDataADZGPU27(para, comm, cudaManager, level);
+    }
 }
 
 void postCollisionBC(Parameter* para, int level, unsigned int t)
@@ -205,20 +417,20 @@ void postCollisionBC(Parameter* para, int level, unsigned int t)
         //           para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
         //getLastCudaError("QVelDev27 execution failed");
         
-        //QVelDevComp27( para->getParD(level)->numberofthreads, para->getParD(level)->nx,           para->getParD(level)->ny,
-        //               para->getParD(level)->Qinflow.Vx,      para->getParD(level)->Qinflow.Vy,   para->getParD(level)->Qinflow.Vz,
-        //               para->getParD(level)->d0SP.f[0],       para->getParD(level)->Qinflow.k,    para->getParD(level)->Qinflow.q27[0], 
-        //               para->getParD(level)->kInflowQ,        para->getParD(level)->kInflowQ,     para->getParD(level)->omega,
-        //               para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP,
-        //               para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
-        //getLastCudaError("QVelDevComp27 execution failed");
+        // QVelDevComp27( para->getParD(level)->numberofthreads, para->getParD(level)->nx,           para->getParD(level)->ny,
+        //                para->getParD(level)->Qinflow.Vx,      para->getParD(level)->Qinflow.Vy,   para->getParD(level)->Qinflow.Vz,
+        //                para->getParD(level)->d0SP.f[0],       para->getParD(level)->Qinflow.k,    para->getParD(level)->Qinflow.q27[0], 
+        //                para->getParD(level)->kInflowQ,        para->getParD(level)->kInflowQ,     para->getParD(level)->omega,
+        //                para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP,
+        //                para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
+        // getLastCudaError("QVelDevComp27 execution failed");
 
         QVelDevCompZeroPress27(para->getParD(level)->numberofthreads, para->getParD(level)->nx,             para->getParD(level)->ny,
-                               para->getParD(level)->Qinflow.Vx,      para->getParD(level)->Qinflow.Vy,     para->getParD(level)->Qinflow.Vz,
-                               para->getParD(level)->d0SP.f[0],       para->getParD(level)->Qinflow.k,      para->getParD(level)->Qinflow.q27[0],
-                               para->getParD(level)->kInflowQ,        para->getParD(level)->Qinflow.kArray, para->getParD(level)->omega,
-                               para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP,   para->getParD(level)->neighborZ_SP,
-                               para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
+                              para->getParD(level)->Qinflow.Vx,      para->getParD(level)->Qinflow.Vy,     para->getParD(level)->Qinflow.Vz,
+                              para->getParD(level)->d0SP.f[0],       para->getParD(level)->Qinflow.k,      para->getParD(level)->Qinflow.q27[0],
+                              para->getParD(level)->kInflowQ,        para->getParD(level)->Qinflow.kArray, para->getParD(level)->omega,
+                              para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP,   para->getParD(level)->neighborZ_SP,
+                              para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
         getLastCudaError("QVelDevCompZeroPress27 execution failed");
 
         //////////////////////////////////////////////////////////////////////////
@@ -380,21 +592,21 @@ void postCollisionBC(Parameter* para, int level, unsigned int t)
         //           para->getParD(level)->size_Mat_SP,           para->getParD(level)->evenOrOdd);
         //getLastCudaError("QDevComp27 (Geom) execution failed");
 
-        //QVelDevComp27(para->getParD(level)->numberofthreads, para->getParD(level)->nx,           para->getParD(level)->ny,
-        //              para->getParD(level)->QGeom.Vx,        para->getParD(level)->QGeom.Vy,     para->getParD(level)->QGeom.Vz,
-        //              para->getParD(level)->d0SP.f[0],       para->getParD(level)->QGeom.k,      para->getParD(level)->QGeom.q27[0], 
-        //              para->getParD(level)->QGeom.kQ,        para->getParD(level)->QGeom.kQ,     para->getParD(level)->omega,
-        //              para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP,
-        //              para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
-        //getLastCudaError("QVelDevComp27 execution failed");
-
-   		QVelDevCompZeroPress27(	para->getParD(0)->numberofthreads, para->getParD(0)->nx,           para->getParD(0)->ny,
-								para->getParD(0)->QGeom.Vx,        para->getParD(0)->QGeom.Vy,     para->getParD(0)->QGeom.Vz,
-								para->getParD(0)->d0SP.f[0],       para->getParD(0)->QGeom.k,      para->getParD(0)->QGeom.q27[0], 
-								para->getParD(0)->QGeom.kQ,        para->getParD(0)->QGeom.kQ,     para->getParD(0)->omega,
-								para->getParD(0)->neighborX_SP,    para->getParD(0)->neighborY_SP, para->getParD(0)->neighborZ_SP,
-								para->getParD(0)->size_Mat_SP,     para->getParD(0)->evenOrOdd);
-		getLastCudaError("QVelDevCompZeroPress27 execution failed");
+        QVelDevComp27(para->getParD(level)->numberofthreads, para->getParD(level)->nx,           para->getParD(level)->ny,
+                      para->getParD(level)->QGeom.Vx,        para->getParD(level)->QGeom.Vy,     para->getParD(level)->QGeom.Vz,
+                      para->getParD(level)->d0SP.f[0],       para->getParD(level)->QGeom.k,      para->getParD(level)->QGeom.q27[0], 
+                      para->getParD(level)->QGeom.kQ,        para->getParD(level)->QGeom.kQ,     para->getParD(level)->omega,
+                      para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP,
+                      para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
+        getLastCudaError("QVelDevComp27 execution failed");
+
+        // QVelDevCompZeroPress27( para->getParD(0)->numberofthreads, para->getParD(0)->nx,           para->getParD(0)->ny,
+        //                         para->getParD(0)->QGeom.Vx,        para->getParD(0)->QGeom.Vy,     para->getParD(0)->QGeom.Vz,
+        //                         para->getParD(0)->d0SP.f[0],       para->getParD(0)->QGeom.k,      para->getParD(0)->QGeom.q27[0],
+        //                         para->getParD(0)->QGeom.kQ,        para->getParD(0)->QGeom.kQ,     para->getParD(0)->omega,
+        //                         para->getParD(0)->neighborX_SP,    para->getParD(0)->neighborY_SP, para->getParD(0)->neighborZ_SP,
+        //                         para->getParD(0)->size_Mat_SP,     para->getParD(0)->evenOrOdd);
+        // getLastCudaError("QVelDevCompZeroPress27 execution failed");
 
         //QDev3rdMomentsComp27( para->getParD(level)->numberofthreads,       para->getParD(level)->nx,           para->getParD(level)->ny,
         //                      para->getParD(level)->d0SP.f[0],             para->getParD(level)->QGeom.k,      para->getParD(level)->QGeom.q27[0], 
@@ -854,10 +1066,10 @@ void preCollisionBC(Parameter* para, CudaMemoryManager* cudaManager, int level,
 	if (para->getParD(level)->QPress.kQ > 0)
 	{
 		QPressNoRhoDev27(para->getParD(level)->numberofthreads, para->getParD(level)->QPress.RhoBC,
-		                 para->getParD(level)->d0SP.f[0],       para->getParD(level)->QPress.k,
-		                 para->getParD(level)->QPress.kN,       para->getParD(level)->QPress.kQ,     para->getParD(level)->omega,
-		                 para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP,  para->getParD(level)->neighborZ_SP,
-		                 para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
+		                para->getParD(level)->d0SP.f[0],       para->getParD(level)->QPress.k,
+		                para->getParD(level)->QPress.kN,       para->getParD(level)->QPress.kQ,     para->getParD(level)->omega,
+		                para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP,  para->getParD(level)->neighborZ_SP,
+		                para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
 		getLastCudaError("QPressNoRhoDev27 execution failed");
 
 		//QPressDevEQZ27(para->getParD(level)->numberofthreads, para->getParD(level)->QPress.RhoBC, 
@@ -885,12 +1097,12 @@ void preCollisionBC(Parameter* para, CudaMemoryManager* cudaManager, int level,
         //getLastCudaError("QPressDevIncompNEQ27 execution failed");
         //////////////////////////////////////////////////////////////////////////////////
         //press NEQ compressible
-        //QPressDevNEQ27( para->getParD(level)->numberofthreads, para->getParD(level)->QPress.RhoBC, 
-        //                para->getParD(level)->d0SP.f[0],       para->getParD(level)->QPress.k,  
-        //                para->getParD(level)->QPress.kN,       para->getParD(level)->QPress.kQ,    para->getParD(level)->omega,
-        //                para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP,
-        //                para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
-        //getLastCudaError("QPressDevNEQ27 execution failed");
+        // QPressDevNEQ27( para->getParD(level)->numberofthreads, para->getParD(level)->QPress.RhoBC, 
+        //                 para->getParD(level)->d0SP.f[0],       para->getParD(level)->QPress.k,  
+        //                 para->getParD(level)->QPress.kN,       para->getParD(level)->QPress.kQ,    para->getParD(level)->omega,
+        //                 para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP,
+        //                 para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
+        // getLastCudaError("QPressDevNEQ27 execution failed");
 
 	}
 
@@ -971,7 +1183,7 @@ void fineToCoarse(Parameter* para, int level)
 							para->getParD(level)->K_FC,           para->getParD(level)->omega,           para->getParD(level+1)->omega, 
 							para->getParD(level)->vis,            para->getParD(level)->nx,              para->getParD(level)->ny, 
 							para->getParD(level+1)->nx,           para->getParD(level+1)->ny,            para->getParD(level)->numberofthreads,
-							para->getParD(level)->offFC);
+							para->getParD(level)->offFC,          CU_STREAM_LEGACY);
     getLastCudaError("ScaleFC27_RhoSq_comp execution failed");
 
 	//ScaleFC_AA2016_comp_27( para->getParD(level)->d0SP.f[0],      para->getParD(level+1)->d0SP.f[0], 
@@ -1116,6 +1328,30 @@ void fineToCoarse(Parameter* para, int level)
 
 }
 
+void fineToCoarseWithStream(Parameter *para, int level, uint *iCellFCC, uint *iCellFCF, uint k_FC, int streamIndex)
+{
+    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+
+    ScaleFC_RhoSq_comp_27(para->getParD(level)->d0SP.f[0],        para->getParD(level + 1)->d0SP.f[0],
+                          para->getParD(level)->neighborX_SP,     para->getParD(level)->neighborY_SP,     para->getParD(level)->neighborZ_SP,
+                          para->getParD(level + 1)->neighborX_SP, para->getParD(level + 1)->neighborY_SP, para->getParD(level + 1)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP,      para->getParD(level + 1)->size_Mat_SP,  para->getParD(level)->evenOrOdd,
+                          iCellFCC,                               iCellFCF,
+                          k_FC,                                   para->getParD(level)->omega,            para->getParD(level + 1)->omega,
+                          para->getParD(level)->vis,              para->getParD(level)->nx,               para->getParD(level)->ny,
+                          para->getParD(level + 1)->nx,           para->getParD(level + 1)->ny,           para->getParD(level)->numberofthreads,
+                          para->getParD(level)->offFC,            stream);
+    getLastCudaError("ScaleFC_RhoSq_comp_27 execution failed");
+
+    //////////////////////////////////////////////////////////////////////////
+    // A D V E C T I O N    D I F F U S I O N
+    //////////////////////////////////////////////////////////////////////////
+
+    if (para->getDiffOn()) {
+        printf("fineToCoarseWithStream Advection Diffusion not implemented"); // TODO
+    }
+}
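+
+// Usage sketch (hypothetical call, not part of this change set): streamIndex == -1
+// selects the legacy/default CUDA stream, any other value indexes into the
+// CudaStreamManager, e.g.
+//   fineToCoarseWithStream(para, level, iCellFCC, iCellFCF, k_FC, -1); // legacy stream
+//   fineToCoarseWithStream(para, level, iCellFCC, iCellFCF, k_FC, 0);  // managed stream 0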
+
 void coarseToFine(Parameter* para, int level)
 {
     //ScaleCF_comp_D3Q27F3(para->getParD(level)->d0SP.f[0],      para->getParD(level+1)->d0SP.f[0],     para->getParD(level+1)->g6.g[0],
@@ -1159,7 +1395,7 @@ void coarseToFine(Parameter* para, int level)
                           para->getParD(level)->K_CF,             para->getParD(level)->omega,            para->getParD(level + 1)->omega,
                           para->getParD(level)->vis,              para->getParD(level)->nx,               para->getParD(level)->ny,
                           para->getParD(level + 1)->nx,           para->getParD(level + 1)->ny,           para->getParD(level)->numberofthreads,
-                          para->getParD(level)->offCF);
+                          para->getParD(level)->offCF,            CU_STREAM_LEGACY);
     getLastCudaError("ScaleCF27_RhoSq_comp execution failed");
 
     //ScaleCF_AA2016_comp_27( para->getParD(level)->d0SP.f[0],      para->getParD(level+1)->d0SP.f[0],                
@@ -1304,6 +1540,92 @@ void coarseToFine(Parameter* para, int level)
 
 }
 
+void coarseToFineWithStream(Parameter *para, int level, uint *iCellCFC, uint *iCellCFF, uint k_CF, OffCF &offCF,
+                            int streamIndex)
+{
+    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+
+    ScaleCF_RhoSq_comp_27(para->getParD(level)->d0SP.f[0],        para->getParD(level + 1)->d0SP.f[0],
+                          para->getParD(level)->neighborX_SP,     para->getParD(level)->neighborY_SP,     para->getParD(level)->neighborZ_SP,
+                          para->getParD(level + 1)->neighborX_SP, para->getParD(level + 1)->neighborY_SP, para->getParD(level + 1)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP,      para->getParD(level + 1)->size_Mat_SP,  para->getParD(level)->evenOrOdd,
+                          iCellCFC,                               iCellCFF,
+                          k_CF,                                   para->getParD(level)->omega,            para->getParD(level + 1)->omega,
+                          para->getParD(level)->vis,              para->getParD(level)->nx,               para->getParD(level)->ny,
+                          para->getParD(level + 1)->nx,           para->getParD(level + 1)->ny,           para->getParD(level)->numberofthreads,
+                          offCF,                                  stream);
+    getLastCudaError("ScaleCF27_RhoSq_comp execution failed");
+
+    if (para->getDiffOn()) {
+        printf("CoarseToFineWithStream Advection Diffusion not implemented"); // TODO
+    }
+}
+
+
+UpdateGrid27::UpdateGrid27(SPtr<Parameter> para, vf::gpu::Communicator &comm, SPtr<CudaMemoryManager> cudaManager,
+                           std::vector<std::shared_ptr<PorousMedia>> &pm, std::vector<SPtr<Kernel>> &kernels)
+    : para(para), comm(comm), cudaManager(cudaManager), pm(pm), kernels(kernels)
+{ 
+    chooseFunctionForCollisionAndExchange();
+    chooseFunctionForRefinementAndExchange();
+}
+
+
+void UpdateGrid27::chooseFunctionForCollisionAndExchange()
+{
+    std::cout << "Function used for collisionAndExchange: ";
+    if (para->getUseStreams() && para->getNumprocs() > 1 && para->getKernelNeedsFluidNodeIndicesToRun()) {
+        this->collisionAndExchange = &UpdateGrid27::collisionAndExchange_streams; 
+        std::cout << "collisionAndExchange_streams()" << std::endl;
+
+    } else if (para->getUseStreams() && !para->getKernelNeedsFluidNodeIndicesToRun()) {
+        std::cout << "Cuda Streams can only be used with kernels which run using fluidNodesIndices." << std::endl;
+
+    } else if (para->getUseStreams() && para->getNumprocs() <= 1) {
+        std::cout << "Cuda Streams can only be used with multiple MPI processes." << std::endl;
+
+    } else if (!para->getUseStreams() && para->getKernelNeedsFluidNodeIndicesToRun()) {
+        this->collisionAndExchange = &UpdateGrid27::collisionAndExchange_noStreams_indexKernel;
+        std::cout << "collisionAndExchange_noStreams_indexKernel()" << std::endl;
+
+    } else if (!para->getUseStreams() && !para->getKernelNeedsFluidNodeIndicesToRun()) {
+        this->collisionAndExchange = &UpdateGrid27::collisionAndExchange_noStreams_oldKernel;
+        std::cout << "collisionAndExchange_noStreams_oldKernel()" << std::endl;
+
+    } else {
+        std::cout << "Invalid Configuration for collision and exchange" << std::endl;
+    }
+}
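+
+// Note: collisionAndExchange is a pointer to a member function; the chosen variant is
+// presumably invoked from updateGrid() via the pointer-to-member call syntax, e.g.
+//   (this->*collisionAndExchange)(level, t);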
+
+void UpdateGrid27::chooseFunctionForRefinementAndExchange()
+{
+    std::cout << "Function used for refinementAndExchange: ";
+    if (para->getMaxLevel() == 0) {
+        this->refinementAndExchange = &UpdateGrid27::refinementAndExchange_noRefinementAndExchange;
+        std::cout << "only one level - no function needed." << std::endl;
+
+    } else if (para->getNumprocs() == 1) {
+        this->refinementAndExchange = &UpdateGrid27::refinementAndExchange_noExchange;
+        std::cout << "refinementAndExchange_noExchange()" << std::endl;
+    
+    } else if (para->getNumprocs() > 1 && para->getUseStreams() && para->useReducedCommunicationAfterFtoC) {
+        this->refinementAndExchange = &UpdateGrid27::refinementAndExchange_streams_onlyExchangeInterface;
+        std::cout << "refinementAndExchange_streams_onlyExchangeInterface()" << std::endl;
+    
+    } else if(para->getNumprocs() > 1 && para->getUseStreams() && !para->useReducedCommunicationAfterFtoC){
+        this->refinementAndExchange = &UpdateGrid27::refinementAndExchange_streams_completeExchange; 
+        std::cout << "refinementAndExchange_streams_completeExchange()" << std::endl;
+    
+    } else if (para->getNumprocs() > 1 && !para->getUseStreams() && para->useReducedCommunicationAfterFtoC) {
+        this->refinementAndExchange = &UpdateGrid27::refinementAndExchange_noStreams_onlyExchangeInterface;
+        std::cout << "refinementAndExchange_noStreams_onlyExchangeInterface()" << std::endl;
+
+    } else {
+        this->refinementAndExchange = &UpdateGrid27::refinementAndExchange_noStreams_completeExchange;
+        std::cout << "refinementAndExchange_noStreams_completeExchange()" << std::endl;
+    }
+}
+
 void interactWithActuators(Parameter* para, CudaMemoryManager* cudaManager, int level, unsigned int t)
 {
     for( SPtr<PreCollisionInteractor> actuator: para->getActuators() )
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
index e7e411807c25022957bc0218427b2b367663a23e..2da3c7cb832bbd87f98e846f020ab1f02cb6b6fd 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.h
@@ -10,21 +10,62 @@
 
 class Kernel;
 
-extern "C" void updateGrid27(Parameter* para, 
-                             vf::gpu::Communicator& comm, 
-                             CudaMemoryManager* cudaManager, 
-                             std::vector<std::shared_ptr<PorousMedia>>& pm, 
-                             int level,
-                             unsigned int t, 
-                             std::vector < SPtr< Kernel>>& kernels);
+class UpdateGrid27
+{
+public:
+    UpdateGrid27(SPtr<Parameter> para, vf::gpu::Communicator &comm, SPtr<CudaMemoryManager> cudaManager,
+                 std::vector<std::shared_ptr<PorousMedia>> &pm, std::vector<SPtr<Kernel>> &kernels);
+    void updateGrid(int level, unsigned int t);
 
-extern "C" void collision(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level, unsigned int t, std::vector < SPtr< Kernel>>& kernels);
+private:
+    typedef void (UpdateGrid27::*collisionAndExchangeFun)(int level, unsigned int t);
+    typedef void (UpdateGrid27::*refinementAndExchangeFun)(int level);
+    collisionAndExchangeFun collisionAndExchange = nullptr;
+    refinementAndExchangeFun refinementAndExchange = nullptr;
+
+    void chooseFunctionForCollisionAndExchange();
+    void chooseFunctionForRefinementAndExchange();
+
+    // functions for collision and exchange
+    void collisionAndExchange_noStreams_indexKernel(int level, unsigned int t);
+    void collisionAndExchange_noStreams_oldKernel(int level, unsigned int t);
+    void collisionAndExchange_streams(int level, unsigned int t);
+
+    // functions for refinement and exchange
+    void refinementAndExchange_streams_onlyExchangeInterface(int level);
+    void refinementAndExchange_streams_completeExchange(int level);
+    void refinementAndExchange_noStreams_onlyExchangeInterface(int level);
+    void refinementAndExchange_noStreams_completeExchange(int level);
+    void refinementAndExchange_noRefinementAndExchange(int level);
+    void refinementAndExchange_noExchange(int level);
+
+
+    SPtr<Parameter> para;
+    vf::gpu::Communicator& comm;
+    SPtr<CudaMemoryManager> cudaManager;
+    std::vector<std::shared_ptr<PorousMedia>> pm;
+    std::vector<SPtr<Kernel>> kernels;
+};
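+
+// Usage sketch (hypothetical call site, based on the constructor and updateGrid() above):
+//   UpdateGrid27 updateGrid27(para, communicator, cudaManager, pm, kernels);
+//   updateGrid27.updateGrid(level, t); // once per level and time step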
+
+
+
+extern "C" void collision(Parameter *para, std::vector<std::shared_ptr<PorousMedia>> &pm, int level, unsigned int t,  std::vector<SPtr<Kernel>> &kernels);
+
+extern "C" void collisionUsingIndex(Parameter *para, std::vector<std::shared_ptr<PorousMedia>> &pm, int level, unsigned int t,  std::vector<SPtr<Kernel>> &kernels, uint *fluidNodeIndices = nullptr, uint numberOfFluidNodes = 0, int stream = -1);
 
 extern "C" void collisionPorousMedia(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level);
 
 extern "C" void collisionAdvectionDiffusion(Parameter* para, int level);
 
-extern "C" void exchangeMultiGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
+extern "C" void prepareExchangeMultiGPU(Parameter *para, int level, int streamIndex);
+extern "C" void prepareExchangeMultiGPUAfterFtoC(Parameter *para, int level, int streamIndex);
+
+extern "C" void exchangeMultiGPU(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                 int level, int streamIndex);
+extern "C" void exchangeMultiGPUAfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                 int level, int streamIndex);
+extern "C" void exchangeMultiGPU_noStreams_withPrepare(Parameter *para, vf::gpu::Communicator &comm,
+                                                       CudaMemoryManager *cudaManager, int level, bool useReducedComm);
 
 extern "C" void postCollisionBC(Parameter* para, int level, unsigned int t);
 
@@ -35,8 +76,13 @@ extern "C" void calcMacroscopicQuantities(Parameter* para, int level);
 extern "C" void preCollisionBC(Parameter* para, CudaMemoryManager* cudaManager, int level, unsigned int t);
 
 extern "C" void fineToCoarse(Parameter* para, int level);
+extern "C" void fineToCoarseWithStream(Parameter *para, int level, uint *iCellFCC, uint *iCellFCF, uint k_FC, int streamIndex);
 
 extern "C" void coarseToFine(Parameter* para, int level);
+extern "C" void coarseToFineWithStream(Parameter *para, int level, uint *iCellCFC, uint *iCellCFF, uint k_CF,
+                                       OffCF &offCF, int streamIndex);
+
+
 
 extern "C" void calcTurbulentViscosity(Parameter* para, int level);
 
diff --git a/src/gpu/VirtualFluids_GPU/Communication/Communicator.cpp b/src/gpu/VirtualFluids_GPU/Communication/Communicator.cpp
index 2743f454e321bf21cb4d0b7fd08aab8600a2bee8..155251a3273c8976c058eddad760b8808b451433 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/Communicator.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/Communicator.cpp
@@ -28,6 +28,10 @@ Communicator::Communicator()
     MPI_Comm_rank(MPI_COMM_WORLD, &PID);
     MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
 
+    commGPU = MPI_COMM_WORLD;
+    requestGPU.resize(0);
+    rcount = 0;
+
     // Get a new communicator for a decomposition of the domain
     int isperiodic[1] = { 0 };
     MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, isperiodic, 1, &comm1d);
@@ -215,4 +219,39 @@ int Communicator::mapCudaDevice(const int &rank, const int &size, const std::vec
     return device;
 }
 
+std::vector<double> Communicator::gatherNUPS(double processNups)
+{ 
+    double *buffer_send = &processNups;
+    double *buffer_recv = (double *)malloc(sizeof(double) * this->numprocs);
+
+    MPI_Gather(buffer_send, 1, MPI_DOUBLE, buffer_recv, 1, MPI_DOUBLE, 0, commGPU);
+
+    if (this->PID == 0)
+        return std::vector<double>(buffer_recv, buffer_recv + this->numprocs);
+    return std::vector<double>(); 
 }
+
+double Communicator::sumNups(double processNups)
+{
+    // reduce into a stack variable instead of a leaked malloc buffer; the sum is only
+    // significant on the root rank
+    double sum = 0.0;
+
+    MPI_Reduce(&processNups, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, commGPU);
+
+    return sum;
+}
+
+void vf::gpu::Communicator::exchangeIndices(uint *rbuf, int count_r, int nb_rank_r, uint *sbuf, int count_s,
+                                            int nb_rank_s)
+{
+    // post the non-blocking receive before the blocking send, so the pairwise exchange
+    // between neighboring ranks cannot deadlock on two blocking sends
+    MPI_Request recv_request;
+    MPI_Irecv(rbuf, count_r, MPI_UNSIGNED, nb_rank_r, 0, commGPU, &recv_request);
+    MPI_Send(sbuf, count_s, MPI_UNSIGNED, nb_rank_s, 0, commGPU);
+    MPI_Wait(&recv_request, MPI_STATUS_IGNORE); // MPI_STATUS_IGNORE: single request
+}
+
+} // namespace vf::gpu
diff --git a/src/gpu/VirtualFluids_GPU/Communication/Communicator.h b/src/gpu/VirtualFluids_GPU/Communication/Communicator.h
index 256dde87e8ff6b3a8c7abcae0ac31466cc68ba95..6227dbd8210ea27013ad252cf64f399c611a9d75 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/Communicator.h
+++ b/src/gpu/VirtualFluids_GPU/Communication/Communicator.h
@@ -57,6 +57,10 @@ public:
     void stopTimer();
     double getTime();
     int mapCudaDevice(const int &rank, const int &size, const std::vector<unsigned int> &devices, const int &maxdev);
+    std::vector<double> gatherNUPS(double processNups);
+    double sumNups(double processNups);
+    //////////////////////////////////////////////////////////////////////////
+    void exchangeIndices(uint *rbuf, int count_r, int nb_rank_r, uint *sbuf, int count_s, int nb_rank_s);
 private:
    int numprocs, PID;
    int nbrbottom, nbrtop; 
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
index d91e86c3140bb08aa2d8ef28d7cc147b23a2b804..60c45c3dcb3805da61d3a4a78ff43323b329339b 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.cpp
@@ -1,511 +1,394 @@
 #include "Communication/ExchangeData27.h"
+#include "Parameter/CudaStreamManager.h"
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//3D domain decomposition
+// 3D domain decomposition
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// X
+// 3D domain decomposition: functions used by all directions
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePreCollDataXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
+void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
+                                 std::vector<ProcessNeighbor27> *sendProcessNeighbor,
+                                 unsigned int numberOfSendProcessNeighbors)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
-						  para->getParD(level)->sendProcessNeighborX[i].f[0],
-						  para->getParD(level)->sendProcessNeighborX[i].index,
-						  para->getParD(level)->sendProcessNeighborX[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborXFsDH(level, i);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborX[i].f[0],
-							para->getParH(level)->recvProcessNeighborX[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborX[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborX[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborX[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0],
-						  para->getParH(level)->sendProcessNeighborX[i].numberOfFs,
-						  para->getParH(level)->sendProcessNeighborX[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborXFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
-						  para->getParD(level)->recvProcessNeighborX[i].f[0],
-						  para->getParD(level)->recvProcessNeighborX[i].index,
-						  para->getParD(level)->recvProcessNeighborX[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+
+    for (unsigned int i = 0; i < numberOfSendProcessNeighbors; i++) {
+        GetSendFsPostDev27(para->getParD(level)->d0SP.f[0], 
+                           (*sendProcessNeighbor)[i].f[0],
+                           (*sendProcessNeighbor)[i].index, 
+                           (*sendProcessNeighbor)[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP,
+                           para->getParD(level)->neighborZ_SP, 
+                           para->getParD(level)->size_Mat_SP,
+                           para->getParD(level)->evenOrOdd, 
+                           para->getParD(level)->numberofthreads, 
+                           stream);
+    }
 }
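+// collectNodesInSendBufferGPU packs the distributions of all process-boundary nodes into
+// a contiguous device-side send buffer via GetSendFsPostDev27; with streamIndex != -1 the
+// kernel runs on the selected stream, presumably to overlap the packing with other work.
+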
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePostCollDataXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
+
+void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex,
+                                   std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                                   unsigned int numberOfRecvProcessNeighbors)
+{
+    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    for (unsigned int i = 0; i < numberOfRecvProcessNeighbors; i++) {
+        SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0], 
+                           (*recvProcessNeighborDev)[i].f[0],
+                           (*recvProcessNeighborDev)[i].index, 
+                           (*recvProcessNeighborDev)[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP,
+                           para->getParD(level)->neighborZ_SP, 
+                           para->getParD(level)->size_Mat_SP,
+                           para->getParD(level)->evenOrOdd, 
+                           para->getParD(level)->numberofthreads, 
+                           stream);
+    }
+}
+
+void startBlockingMpiSend(unsigned int numberOfSendProcessNeighbors, vf::gpu::Communicator &comm,
+                          std::vector<ProcessNeighbor27> *sendProcessNeighborHost)
+{
+    for (unsigned int i = 0; i < numberOfSendProcessNeighbors; i++) {
+        comm.sendDataGPU((*sendProcessNeighborHost)[i].f[0], 
+                          (*sendProcessNeighborHost)[i].numberOfFs,
+                          (*sendProcessNeighborHost)[i].rankNeighbor);
+    }
+}
+
+void startNonBlockingMpiReceive(unsigned int numberOfSendProcessNeighbors, vf::gpu::Communicator &comm,
+                                std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
+{
+    for (unsigned int i = 0; i < numberOfSendProcessNeighbors; i++) {
+        comm.nbRecvDataGPU((*recvProcessNeighborHost)[i].f[0], 
+                            (*recvProcessNeighborHost)[i].numberOfFs,
+                            (*recvProcessNeighborHost)[i].rankNeighbor);
+    }
+}
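+
+// Note: the receive loop is bounded by the number of *send* process neighbors; this
+// presumably relies on the send and receive neighbor lists being symmetric (every rank
+// we send to is also a rank we receive from).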
+
+void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeNodes,
+                   std::vector<ProcessNeighbor27> &recvProcessNeighborHost,
+                   std::vector<ProcessNeighbor27> &sendProcessNeighborHost)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		GetSendFsPostDev27(para->getParD(level)->d0SP.f[0],
-						   para->getParD(level)->sendProcessNeighborX[i].f[0],
-						   para->getParD(level)->sendProcessNeighborX[i].index,
-						   para->getParD(level)->sendProcessNeighborX[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborXFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborX[i].f[0],
-							para->getParH(level)->recvProcessNeighborX[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborX[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborX[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborX[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0],
-						  para->getParH(level)->sendProcessNeighborX[i].numberOfFs,
-						  para->getParH(level)->sendProcessNeighborX[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborXFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0],
-						   para->getParD(level)->recvProcessNeighborX[i].f[0],
-						   para->getParD(level)->recvProcessNeighborX[i].index,
-						   para->getParD(level)->recvProcessNeighborX[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma omp parallel for
+    for (int i = 0; i < (int)edgeNodes.size(); i++) {
+        // declared inside the loop so every OpenMP thread gets its own private copies;
+        // as shared loop-carried variables these were a data race
+        const int indexInSubdomainRecv = edgeNodes[i].indexOfProcessNeighborRecv;
+        const int indexInSubdomainSend = edgeNodes[i].indexOfProcessNeighborSend;
+        const int numNodesInBufferRecv = recvProcessNeighborHost[indexInSubdomainRecv].numberOfNodes;
+        const int numNodesInBufferSend = sendProcessNeighborHost[indexInSubdomainSend].numberOfNodes;
+        if (edgeNodes[i].indexInSendBuffer >= numNodesInBufferSend) {
+            // reduced communication after fine to coarse: skip edge nodes whose index lies
+            // outside the (smaller) send buffer
+            continue;
+        }
+
+        // copy fs for all directions
+        for (int direction = 0; direction <= (int)dirEND; direction++) {
+            (sendProcessNeighborHost[indexInSubdomainSend].f[0] +
+             (direction * numNodesInBufferSend))[edgeNodes[i].indexInSendBuffer] =
+                (recvProcessNeighborHost[indexInSubdomainRecv].f[0] +
+                 (direction * numNodesInBufferRecv))[edgeNodes[i].indexInRecvBuffer];
+        }
+    }
 }
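+// copyEdgeNodes forwards values for nodes sitting on the edge between two communication
+// directions: distributions that arrived in one direction's receive buffer are copied
+// directly into the next direction's send buffer on the host, avoiding an extra device
+// round trip (see the call site in exchangeCollDataYGPU27 below).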
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// X
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex)
+{
+    collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborX,
+                                (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
+}
+
+void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+{
+    collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCX,
+                                (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
+}
+
+void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                    int level, int streamIndex)
+{
+    exchangeCollDataXGPU27(para, comm, cudaManager, level, streamIndex, 
+                           &para->getParD(level)->sendProcessNeighborX,
+                           &para->getParD(level)->recvProcessNeighborX,
+                           &para->getParH(level)->sendProcessNeighborX,
+                           &para->getParH(level)->recvProcessNeighborX);
+}
+
+void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                     int level, int streamIndex)
+{
+    exchangeCollDataXGPU27(para, comm, cudaManager, level, streamIndex, 
+                           &para->getParD(level)->sendProcessNeighborsAfterFtoCX,
+                           &para->getParD(level)->recvProcessNeighborsAfterFtoCX,
+                           &para->getParH(level)->sendProcessNeighborsAfterFtoCX,
+                           &para->getParH(level)->recvProcessNeighborsAfterFtoCX);
+}
 
+void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex)
+{
+    scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborX,
+                                  (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
+}
 
+void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+{
+    scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCX,
+                                  (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")));
+}
 
+void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager, int level,
+                            int streamIndex, 
+                            std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                            std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                            std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
+                            std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
+{
+    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //! \details steps: 
+
+    //! 1. copy data from device to host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+        cudaManager->cudaCopyProcessNeighborXFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
+
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //! 2. start non-blocking receive (MPI)
+    startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //! 3. before sending data, wait for memcopy (from device to host) to finish 
+    if (para->getUseStreams()) cudaStreamSynchronize(stream); 
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //! 4. send data to neighboring process (MPI)
+    startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //! 5. wait until the data is received
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++) comm.waitGPU(i);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //! 6. reset the request array, which was used for the MPI communication
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send"))) comm.resetRequest();
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //! 7. copy received data from host to device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+        cudaManager->cudaCopyProcessNeighborXFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+}
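+
+// Typical call sequence for one exchange in x (sketch, assuming the prepare/exchange/
+// scatter split above):
+//   prepareExchangeCollDataXGPU27AllNodes(para, level, streamIndex);             // pack on device
+//   exchangeCollDataXGPU27AllNodes(para, comm, cudaManager, level, streamIndex); // copy + MPI
+//   scatterNodesFromRecvBufferXGPU27AllNodes(para, level, streamIndex);          // unpack on device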
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Y
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
+void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
-						  para->getParD(level)->sendProcessNeighborY[i].f[0],
-						  para->getParD(level)->sendProcessNeighborY[i].index,
-						  para->getParD(level)->sendProcessNeighborY[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborYFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborY[i].f[0],
-							para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborY[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0],
-						  para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
-			              para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborYFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
-						  para->getParD(level)->recvProcessNeighborY[i].f[0],
-						  para->getParD(level)->recvProcessNeighborY[i].index,
-						  para->getParD(level)->recvProcessNeighborY[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborY,
+                                (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePostCollDataYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
+
+void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		GetSendFsPostDev27(para->getParD(level)->d0SP.f[0],
-						   para->getParD(level)->sendProcessNeighborY[i].f[0],
-						   para->getParD(level)->sendProcessNeighborY[i].index,
-						   para->getParD(level)->sendProcessNeighborY[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborYFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborY[i].f[0],
-							para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborY[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0],
-						  para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
-			              para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborYFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0],
-						   para->getParD(level)->recvProcessNeighborY[i].f[0],
-						   para->getParD(level)->recvProcessNeighborY[i].index,
-						   para->getParD(level)->recvProcessNeighborY[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCY,
+                                (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
 }
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
+void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                    int level, int streamIndex)
+{
+    exchangeCollDataYGPU27(para, comm, cudaManager, level, streamIndex, 
+                           &para->getParD(level)->sendProcessNeighborY,
+                           &para->getParD(level)->recvProcessNeighborY, 
+                           &para->getParH(level)->sendProcessNeighborY,
+                           &para->getParH(level)->recvProcessNeighborY);
+}
+
+void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                     int level, int streamIndex)
+{
+    exchangeCollDataYGPU27(para, comm, cudaManager, level, streamIndex, 
+                           &para->getParD(level)->sendProcessNeighborsAfterFtoCY,
+                           &para->getParD(level)->recvProcessNeighborsAfterFtoCY, 
+                           &para->getParH(level)->sendProcessNeighborsAfterFtoCY,
+                           &para->getParH(level)->recvProcessNeighborsAfterFtoCY);
+}
 
+void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex)
+{
+    scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborY,
+                                  (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
+}
 
+void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+{
+    scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCY,
+                                  (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")));
+}
+
+void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager, int level,
+                            int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                            std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                            std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
+                            std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
+{
+    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+        cudaManager->cudaCopyProcessNeighborYFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
+
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // wait for memcopy device to host to finish before sending data
+    if (para->getUseStreams())
+        cudaStreamSynchronize(stream);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // edge nodes: copy received node values from x
+    if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 &&
+        para->getParH(level)->sendProcessNeighborY.size() != 0) {
+        if (para->getParH(level)->sendProcessNeighborY[0].numberOfNodes ==
+            (*sendProcessNeighborHost)[0].numberOfNodes) {
+            // matching buffer sizes indicate the all-nodes exchange (as opposed to the reduced exchange after fine-to-coarse)
+            copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborX,
+                          *sendProcessNeighborHost);
+        } else {
+            copyEdgeNodes(para->getParH(level)->edgeNodesXtoY, para->getParH(level)->recvProcessNeighborsAfterFtoCX,
+                          *sendProcessNeighborHost);
+        }
+    }
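+    // (Nodes on the x-y edge of the domain take part in both exchanges; copying the values
+    // received in the already finished x exchange into the y send buffer keeps them from
+    // being sent stale. This is inferred from the surrounding calls, not from copyEdgeNodes itself.)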
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // wait for the non-blocking MPI receives to complete
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) comm.waitGPU(i);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send"))) comm.resetRequest();
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++) {
+        cudaManager->cudaCopyProcessNeighborYFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+}
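+
+// Minimal usage sketch (illustration only, not taken from the calling code): splitting the
+// exchange into prepare / exchange / scatter lets a caller interleave these phases with
+// other work. The stream index used here is hypothetical.
+//
+// static void sketchExchangeY(Parameter *para, vf::gpu::Communicator &comm,
+//                             CudaMemoryManager *cudaManager, int level, int commStreamIndex)
+// {
+//     prepareExchangeCollDataYGPU27AllNodes(para, level, commStreamIndex);
+//     exchangeCollDataYGPU27AllNodes(para, comm, cudaManager, level, commStreamIndex);
+//     scatterNodesFromRecvBufferYGPU27AllNodes(para, level, commStreamIndex);
+// }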
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Z
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePreCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
+void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex)
+{
+    collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborZ,
+                                (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
+}
+
+void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+{
+    collectNodesInSendBufferGPU(para, level, streamIndex, &para->getParD(level)->sendProcessNeighborsAfterFtoCZ,
+                                (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
+}
+
+void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                    int level, int streamIndex)
+{
+    exchangeCollDataZGPU27(para, comm, cudaManager, level, streamIndex, 
+                           &para->getParD(level)->sendProcessNeighborZ,
+                           &para->getParD(level)->recvProcessNeighborZ, 
+                           &para->getParH(level)->sendProcessNeighborZ,
+                           &para->getParH(level)->recvProcessNeighborZ);
+}
+
+void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                     int level, int streamIndex)
+{
+    exchangeCollDataZGPU27(para, comm, cudaManager, level, streamIndex, 
+                           &para->getParD(level)->sendProcessNeighborsAfterFtoCZ,
+                           &para->getParD(level)->recvProcessNeighborsAfterFtoCZ, 
+                           &para->getParH(level)->sendProcessNeighborsAfterFtoCZ,
+                           &para->getParH(level)->recvProcessNeighborsAfterFtoCZ);
+}
+
+void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
-						  para->getParD(level)->sendProcessNeighborZ[i].f[0],
-						  para->getParD(level)->sendProcessNeighborZ[i].index,
-						  para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborZFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborZ[i].f[0],
-							para->getParH(level)->recvProcessNeighborZ[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborZ[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0],
-						  para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
-						  para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborZFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
-						  para->getParD(level)->recvProcessNeighborZ[i].f[0],
-						  para->getParD(level)->recvProcessNeighborZ[i].index,
-						  para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborZ,
+                                  (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
 }
+
+void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex)
+{
+    scatterNodesFromRecvBufferGPU(para, level, streamIndex, &para->getParD(level)->recvProcessNeighborsAfterFtoCZ,
+                                  (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")));
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-void exchangePostCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
+void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager, int level,
+                            int streamIndex, std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                            std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                            std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
+                            std::vector<ProcessNeighbor27> *recvProcessNeighborHost)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		GetSendFsPostDev27(para->getParD(level)->d0SP.f[0],
-						   para->getParD(level)->sendProcessNeighborZ[i].f[0],
-						   para->getParD(level)->sendProcessNeighborZ[i].index,
-						   para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborZFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborZ[i].f[0],
-							para->getParH(level)->recvProcessNeighborZ[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborZ[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0],
-						  para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
-						  para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborZFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0],
-						   para->getParD(level)->recvProcessNeighborZ[i].f[0],
-						   para->getParD(level)->recvProcessNeighborZ[i].index,
-						   para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    cudaStream_t stream = (streamIndex == -1) ? CU_STREAM_LEGACY : para->getStreamManager()->getStream(streamIndex);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+        cudaManager->cudaCopyProcessNeighborZFsDH(level, i, (*sendProcessNeighborDev)[i].memsizeFs, streamIndex);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    startNonBlockingMpiReceive((unsigned int)(*sendProcessNeighborHost).size(), comm, recvProcessNeighborHost);
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // wait for the device-to-host memcopy to finish before sending the data
+    if (para->getUseStreams())
+        cudaStreamSynchronize(stream);
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // edge nodes: forward the node values just received from the x-direction into the outgoing z buffer
+    if (para->getUseStreams() && para->getNumberOfProcessNeighborsX(level, "recv") > 0 &&
+        para->getParH(level)->sendProcessNeighborZ.size() != 0) {
+        if (para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes ==
+            (*sendProcessNeighborHost)[0].numberOfNodes) {
+            // matching buffer sizes indicate the all-nodes exchange (as opposed to the reduced exchange after fine-to-coarse)
+            copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborX,
+                          *sendProcessNeighborHost);
+        } else {
+            copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCX,
+                          *sendProcessNeighborHost);
+        }
+    }
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // edge nodes: forward the node values just received from the y-direction into the outgoing z buffer
+    if (para->getUseStreams() && para->getNumberOfProcessNeighborsY(level, "recv") > 0 &&
+        para->getParH(level)->sendProcessNeighborZ.size() != 0) {
+        if (para->getParH(level)->sendProcessNeighborZ[0].numberOfNodes ==
+            (*sendProcessNeighborHost)[0].numberOfNodes) {
+            // matching buffer sizes indicate the all-nodes exchange (as opposed to the reduced exchange after fine-to-coarse)
+            copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborY,
+                          *sendProcessNeighborHost);
+        } else {
+            copyEdgeNodes(para->getParH(level)->edgeNodesYtoZ, para->getParH(level)->recvProcessNeighborsAfterFtoCY,
+                          *sendProcessNeighborHost);
+        }
+    }
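+    // (Since z is exchanged last, edge values from both the x and the y exchange are merged
+    // into the outgoing z buffer; this assumes the exchanges run in the order x, y, z.)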
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    startBlockingMpiSend((unsigned int)(*sendProcessNeighborHost).size(), comm, sendProcessNeighborHost);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // wait for the non-blocking MPI receives to complete
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) comm.waitGPU(i);
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send"))) comm.resetRequest();
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++) {
+        cudaManager->cudaCopyProcessNeighborZFsHD(level, i, (*recvProcessNeighborDev)[i].memsizeFs, streamIndex);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 
@@ -531,42 +414,42 @@ void exchangePostCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, Cu
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++)
-	{
-		//////////////////////////////////////////////////////////////////////////
-		GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
-						  para->getParD(level)->sendProcessNeighbor[i].f[0],
-						  para->getParD(level)->sendProcessNeighbor[i].index,
-						  para->getParD(level)->sendProcessNeighbor[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborFsDH(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		comm.exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], 
-							para->getParH(level)->sendProcessNeighbor[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighbor[i].f[0],
-							para->getParH(level)->recvProcessNeighbor[i].numberOfFs,
-							para->getParH(level)->sendProcessNeighbor[i].rankNeighbor);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
-						  para->getParD(level)->recvProcessNeighbor[i].f[0],
-						  para->getParD(level)->recvProcessNeighbor[i].index,
-						  para->getParD(level)->recvProcessNeighbor[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-	}
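+    // Note: unlike the per-direction functions above, this path exchanges each process
+    // neighbor synchronously via comm.exchngDataGPU and does not use CUDA streams.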
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++)
+    {
+        //////////////////////////////////////////////////////////////////////////
+        GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
+                          para->getParD(level)->sendProcessNeighbor[i].f[0],
+                          para->getParD(level)->sendProcessNeighbor[i].index,
+                          para->getParD(level)->sendProcessNeighbor[i].numberOfNodes,
+                          para->getParD(level)->neighborX_SP, 
+                          para->getParD(level)->neighborY_SP, 
+                          para->getParD(level)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP, 
+                          para->getParD(level)->evenOrOdd,
+                          para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborFsDH(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], 
+                            para->getParH(level)->sendProcessNeighbor[i].numberOfFs,
+                            para->getParH(level)->recvProcessNeighbor[i].f[0],
+                            para->getParH(level)->recvProcessNeighbor[i].numberOfFs,
+                            para->getParH(level)->sendProcessNeighbor[i].rankNeighbor);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
+                          para->getParD(level)->recvProcessNeighbor[i].f[0],
+                          para->getParD(level)->recvProcessNeighbor[i].index,
+                          para->getParD(level)->recvProcessNeighbor[i].numberOfNodes,
+                          para->getParD(level)->neighborX_SP, 
+                          para->getParD(level)->neighborY_SP, 
+                          para->getParD(level)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP, 
+                          para->getParD(level)->evenOrOdd,
+                          para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -576,131 +459,83 @@ void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cuda
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++)
-	{
-		//////////////////////////////////////////////////////////////////////////
-		GetSendFsPostDev27(para->getParD(level)->d0SP.f[0],
-						   para->getParD(level)->sendProcessNeighbor[i].f[0],
-						   para->getParD(level)->sendProcessNeighbor[i].index,
-						   para->getParD(level)->sendProcessNeighbor[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborFsDH(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		comm.exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], 
-							para->getParH(level)->sendProcessNeighbor[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighbor[i].f[0],
-							para->getParH(level)->recvProcessNeighbor[i].numberOfFs,
-							para->getParH(level)->sendProcessNeighbor[i].rankNeighbor);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0],
-						   para->getParD(level)->recvProcessNeighbor[i].f[0],
-						   para->getParD(level)->recvProcessNeighbor[i].index,
-						   para->getParD(level)->recvProcessNeighbor[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-	}
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighbors(level, "send")); i++)
+    {
+        //////////////////////////////////////////////////////////////////////////
+        GetSendFsPostDev27(para->getParD(level)->d0SP.f[0],
+                           para->getParD(level)->sendProcessNeighbor[i].f[0],
+                           para->getParD(level)->sendProcessNeighbor[i].index,
+                           para->getParD(level)->sendProcessNeighbor[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP, 
+                           para->getParD(level)->neighborZ_SP,
+                           para->getParD(level)->size_Mat_SP, 
+                           para->getParD(level)->evenOrOdd,
+                           para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborFsDH(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighbor[i].f[0], 
+                            para->getParH(level)->sendProcessNeighbor[i].numberOfFs,
+                            para->getParH(level)->recvProcessNeighbor[i].f[0],
+                            para->getParH(level)->recvProcessNeighbor[i].numberOfFs,
+                            para->getParH(level)->sendProcessNeighbor[i].rankNeighbor);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        SetRecvFsPostDev27(para->getParD(level)->d0SP.f[0],
+                           para->getParD(level)->recvProcessNeighbor[i].f[0],
+                           para->getParD(level)->recvProcessNeighbor[i].index,
+                           para->getParD(level)->recvProcessNeighbor[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP, 
+                           para->getParD(level)->neighborZ_SP,
+                           para->getParD(level)->size_Mat_SP, 
+                           para->getParD(level)->evenOrOdd,
+                           para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+    }
 }
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-
 
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////3D domain decomposition
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//// X
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//void exchangePreCollDataXGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
-//{
-//	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-//	{
-//		//////////////////////////////////////////////////////////////////////////
-//		GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
-//						  para->getParD(level)->sendProcessNeighborX[i].f[0],
-//						  para->getParD(level)->sendProcessNeighborX[i].index,
-//						  para->getParD(level)->sendProcessNeighborX[i].numberOfNodes,
-//						  para->getParD(level)->neighborX_SP, 
-//						  para->getParD(level)->neighborY_SP, 
-//						  para->getParD(level)->neighborZ_SP,
-//						  para->getParD(level)->size_Mat_SP, 
-//						  para->getParD(level)->evenOrOdd,
-//						  para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborXFsDH(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], 
-//							para->getParH(level)->sendProcessNeighborX[i].numberOfFs,
-//							para->getParH(level)->recvProcessNeighborX[i].f[0],
-//							para->getParH(level)->recvProcessNeighborX[i].numberOfFs,
-//							para->getParH(level)->sendProcessNeighborX[i].rankNeighbor);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborXFsHD(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
-//						  para->getParD(level)->recvProcessNeighborX[i].f[0],
-//						  para->getParD(level)->recvProcessNeighborX[i].index,
-//						  para->getParD(level)->recvProcessNeighborX[i].numberOfNodes,
-//						  para->getParD(level)->neighborX_SP, 
-//						  para->getParD(level)->neighborY_SP, 
-//						  para->getParD(level)->neighborZ_SP,
-//						  para->getParD(level)->size_Mat_SP, 
-//						  para->getParD(level)->evenOrOdd,
-//						  para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//	}
-//}
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //void exchangePostCollDataXGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
 //{
-//	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-//	{
-//		//////////////////////////////////////////////////////////////////////////
-//		GetSendFsPostDev27( para->getParD(level)->d0SP.f[0],
-//							para->getParD(level)->sendProcessNeighborX[i].f[0],
-//							para->getParD(level)->sendProcessNeighborX[i].index,
-//							para->getParD(level)->sendProcessNeighborX[i].numberOfNodes,
-//							para->getParD(level)->neighborX_SP, 
-//							para->getParD(level)->neighborY_SP, 
-//							para->getParD(level)->neighborZ_SP,
-//							para->getParD(level)->size_Mat_SP, 
-//							para->getParD(level)->evenOrOdd,
-//							para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborXFsDH(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], 
-//							para->getParH(level)->sendProcessNeighborX[i].numberOfFs,
-//							para->getParH(level)->recvProcessNeighborX[i].f[0],
-//							para->getParH(level)->recvProcessNeighborX[i].numberOfFs,
-//							para->getParH(level)->sendProcessNeighborX[i].rankNeighbor);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborXFsHD(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0],
-//							para->getParD(level)->recvProcessNeighborX[i].f[0],
-//							para->getParD(level)->recvProcessNeighborX[i].index,
-//							para->getParD(level)->recvProcessNeighborX[i].numberOfNodes,
-//							para->getParD(level)->neighborX_SP, 
-//							para->getParD(level)->neighborY_SP, 
-//							para->getParD(level)->neighborZ_SP,
-//							para->getParD(level)->size_Mat_SP, 
-//							para->getParD(level)->evenOrOdd,
-//							para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//	}
+//    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+//    {
+//        //////////////////////////////////////////////////////////////////////////
+//        GetSendFsPostDev27( para->getParD(level)->d0SP.f[0],
+//                            para->getParD(level)->sendProcessNeighborX[i].f[0],
+//                            para->getParD(level)->sendProcessNeighborX[i].index,
+//                            para->getParD(level)->sendProcessNeighborX[i].numberOfNodes,
+//                            para->getParD(level)->neighborX_SP, 
+//                            para->getParD(level)->neighborY_SP, 
+//                            para->getParD(level)->neighborZ_SP,
+//                            para->getParD(level)->size_Mat_SP, 
+//                            para->getParD(level)->evenOrOdd,
+//                            para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborXFsDH(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborX[i].f[0], 
+//                            para->getParH(level)->sendProcessNeighborX[i].numberOfFs,
+//                            para->getParH(level)->recvProcessNeighborX[i].f[0],
+//                            para->getParH(level)->recvProcessNeighborX[i].numberOfFs,
+//                            para->getParH(level)->sendProcessNeighborX[i].rankNeighbor);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborXFsHD(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0],
+//                            para->getParD(level)->recvProcessNeighborX[i].f[0],
+//                            para->getParD(level)->recvProcessNeighborX[i].index,
+//                            para->getParD(level)->recvProcessNeighborX[i].numberOfNodes,
+//                            para->getParD(level)->neighborX_SP, 
+//                            para->getParD(level)->neighborY_SP, 
+//                            para->getParD(level)->neighborZ_SP,
+//                            para->getParD(level)->size_Mat_SP, 
+//                            para->getParD(level)->evenOrOdd,
+//                            para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//    }
 //}
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //
@@ -712,82 +547,82 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
 //{
-//	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-//	{
-//		//////////////////////////////////////////////////////////////////////////
-//		GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
-//						  para->getParD(level)->sendProcessNeighborY[i].f[0],
-//						  para->getParD(level)->sendProcessNeighborY[i].index,
-//						  para->getParD(level)->sendProcessNeighborY[i].numberOfNodes,
-//						  para->getParD(level)->neighborX_SP, 
-//						  para->getParD(level)->neighborY_SP, 
-//						  para->getParD(level)->neighborZ_SP,
-//						  para->getParD(level)->size_Mat_SP, 
-//						  para->getParD(level)->evenOrOdd,
-//						  para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborYFsDH(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], 
-//							para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
-//							para->getParH(level)->recvProcessNeighborY[i].f[0],
-//							para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
-//							para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborYFsHD(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
-//						  para->getParD(level)->recvProcessNeighborY[i].f[0],
-//						  para->getParD(level)->recvProcessNeighborY[i].index,
-//						  para->getParD(level)->recvProcessNeighborY[i].numberOfNodes,
-//						  para->getParD(level)->neighborX_SP, 
-//						  para->getParD(level)->neighborY_SP, 
-//						  para->getParD(level)->neighborZ_SP,
-//						  para->getParD(level)->size_Mat_SP, 
-//						  para->getParD(level)->evenOrOdd,
-//						  para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//	}
+//    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+//    {
+//        //////////////////////////////////////////////////////////////////////////
+//        GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
+//                          para->getParD(level)->sendProcessNeighborY[i].f[0],
+//                          para->getParD(level)->sendProcessNeighborY[i].index,
+//                          para->getParD(level)->sendProcessNeighborY[i].numberOfNodes,
+//                          para->getParD(level)->neighborX_SP, 
+//                          para->getParD(level)->neighborY_SP, 
+//                          para->getParD(level)->neighborZ_SP,
+//                          para->getParD(level)->size_Mat_SP, 
+//                          para->getParD(level)->evenOrOdd,
+//                          para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborYFsDH(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], 
+//                            para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
+//                            para->getParH(level)->recvProcessNeighborY[i].f[0],
+//                            para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
+//                            para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborYFsHD(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
+//                          para->getParD(level)->recvProcessNeighborY[i].f[0],
+//                          para->getParD(level)->recvProcessNeighborY[i].index,
+//                          para->getParD(level)->recvProcessNeighborY[i].numberOfNodes,
+//                          para->getParD(level)->neighborX_SP, 
+//                          para->getParD(level)->neighborY_SP, 
+//                          para->getParD(level)->neighborZ_SP,
+//                          para->getParD(level)->size_Mat_SP, 
+//                          para->getParD(level)->evenOrOdd,
+//                          para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//    }
 //}
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //void exchangePostCollDataYGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
 //{
-//	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-//	{
-//		//////////////////////////////////////////////////////////////////////////
-//		GetSendFsPostDev27( para->getParD(level)->d0SP.f[0],
-//							para->getParD(level)->sendProcessNeighborY[i].f[0],
-//							para->getParD(level)->sendProcessNeighborY[i].index,
-//							para->getParD(level)->sendProcessNeighborY[i].numberOfNodes,
-//							para->getParD(level)->neighborX_SP, 
-//							para->getParD(level)->neighborY_SP, 
-//							para->getParD(level)->neighborZ_SP,
-//							para->getParD(level)->size_Mat_SP, 
-//							para->getParD(level)->evenOrOdd,
-//							para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborYFsDH(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], 
-//							para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
-//							para->getParH(level)->recvProcessNeighborY[i].f[0],
-//							para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
-//							para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborYFsHD(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0],
-//							para->getParD(level)->recvProcessNeighborY[i].f[0],
-//							para->getParD(level)->recvProcessNeighborY[i].index,
-//							para->getParD(level)->recvProcessNeighborY[i].numberOfNodes,
-//							para->getParD(level)->neighborX_SP, 
-//							para->getParD(level)->neighborY_SP, 
-//							para->getParD(level)->neighborZ_SP,
-//							para->getParD(level)->size_Mat_SP, 
-//							para->getParD(level)->evenOrOdd,
-//							para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//	}
+//    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+//    {
+//        //////////////////////////////////////////////////////////////////////////
+//        GetSendFsPostDev27( para->getParD(level)->d0SP.f[0],
+//                            para->getParD(level)->sendProcessNeighborY[i].f[0],
+//                            para->getParD(level)->sendProcessNeighborY[i].index,
+//                            para->getParD(level)->sendProcessNeighborY[i].numberOfNodes,
+//                            para->getParD(level)->neighborX_SP, 
+//                            para->getParD(level)->neighborY_SP, 
+//                            para->getParD(level)->neighborZ_SP,
+//                            para->getParD(level)->size_Mat_SP, 
+//                            para->getParD(level)->evenOrOdd,
+//                            para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborYFsDH(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborY[i].f[0], 
+//                            para->getParH(level)->sendProcessNeighborY[i].numberOfFs,
+//                            para->getParH(level)->recvProcessNeighborY[i].f[0],
+//                            para->getParH(level)->recvProcessNeighborY[i].numberOfFs,
+//                            para->getParH(level)->sendProcessNeighborY[i].rankNeighbor);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborYFsHD(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0],
+//                            para->getParD(level)->recvProcessNeighborY[i].f[0],
+//                            para->getParD(level)->recvProcessNeighborY[i].index,
+//                            para->getParD(level)->recvProcessNeighborY[i].numberOfNodes,
+//                            para->getParD(level)->neighborX_SP, 
+//                            para->getParD(level)->neighborY_SP, 
+//                            para->getParD(level)->neighborZ_SP,
+//                            para->getParD(level)->size_Mat_SP, 
+//                            para->getParD(level)->evenOrOdd,
+//                            para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//    }
 //}
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //
@@ -799,82 +634,82 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //void exchangePreCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
 //{
-//	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-//	{
-//		//////////////////////////////////////////////////////////////////////////
-//		GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
-//						  para->getParD(level)->sendProcessNeighborZ[i].f[0],
-//						  para->getParD(level)->sendProcessNeighborZ[i].index,
-//						  para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes,
-//						  para->getParD(level)->neighborX_SP, 
-//						  para->getParD(level)->neighborY_SP, 
-//						  para->getParD(level)->neighborZ_SP,
-//						  para->getParD(level)->size_Mat_SP, 
-//						  para->getParD(level)->evenOrOdd,
-//						  para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborZFsDH(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], 
-//							para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
-//							para->getParH(level)->recvProcessNeighborZ[i].f[0],
-//							para->getParH(level)->recvProcessNeighborZ[i].numberOfFs,
-//							para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborZFsHD(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
-//						  para->getParD(level)->recvProcessNeighborZ[i].f[0],
-//						  para->getParD(level)->recvProcessNeighborZ[i].index,
-//						  para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes,
-//						  para->getParD(level)->neighborX_SP, 
-//						  para->getParD(level)->neighborY_SP, 
-//						  para->getParD(level)->neighborZ_SP,
-//						  para->getParD(level)->size_Mat_SP, 
-//						  para->getParD(level)->evenOrOdd,
-//						  para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//	}
+//    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+//    {
+//        //////////////////////////////////////////////////////////////////////////
+//        GetSendFsPreDev27(para->getParD(level)->d0SP.f[0],
+//                          para->getParD(level)->sendProcessNeighborZ[i].f[0],
+//                          para->getParD(level)->sendProcessNeighborZ[i].index,
+//                          para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes,
+//                          para->getParD(level)->neighborX_SP, 
+//                          para->getParD(level)->neighborY_SP, 
+//                          para->getParD(level)->neighborZ_SP,
+//                          para->getParD(level)->size_Mat_SP, 
+//                          para->getParD(level)->evenOrOdd,
+//                          para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborZFsDH(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], 
+//                            para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
+//                            para->getParH(level)->recvProcessNeighborZ[i].f[0],
+//                            para->getParH(level)->recvProcessNeighborZ[i].numberOfFs,
+//                            para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborZFsHD(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        SetRecvFsPreDev27(para->getParD(level)->d0SP.f[0],
+//                          para->getParD(level)->recvProcessNeighborZ[i].f[0],
+//                          para->getParD(level)->recvProcessNeighborZ[i].index,
+//                          para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes,
+//                          para->getParD(level)->neighborX_SP, 
+//                          para->getParD(level)->neighborY_SP, 
+//                          para->getParD(level)->neighborZ_SP,
+//                          para->getParD(level)->size_Mat_SP, 
+//                          para->getParD(level)->evenOrOdd,
+//                          para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//    }
 //}
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 //void exchangePostCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, int level)
 //{
-//	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-//	{
-//		//////////////////////////////////////////////////////////////////////////
-//		GetSendFsPostDev27( para->getParD(level)->d0SP.f[0],
-//							para->getParD(level)->sendProcessNeighborZ[i].f[0],
-//							para->getParD(level)->sendProcessNeighborZ[i].index,
-//							para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes,
-//							para->getParD(level)->neighborX_SP, 
-//							para->getParD(level)->neighborY_SP, 
-//							para->getParD(level)->neighborZ_SP,
-//							para->getParD(level)->size_Mat_SP, 
-//							para->getParD(level)->evenOrOdd,
-//							para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborZFsDH(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], 
-//							para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
-//							para->getParH(level)->recvProcessNeighborZ[i].f[0],
-//							para->getParH(level)->recvProcessNeighborZ[i].numberOfFs,
-//							para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor);
-//		//////////////////////////////////////////////////////////////////////////
-//		para->cudaCopyProcessNeighborZFsHD(level, i);
-//		//////////////////////////////////////////////////////////////////////////
-//		SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0],
-//							para->getParD(level)->recvProcessNeighborZ[i].f[0],
-//							para->getParD(level)->recvProcessNeighborZ[i].index,
-//							para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes,
-//							para->getParD(level)->neighborX_SP, 
-//							para->getParD(level)->neighborY_SP, 
-//							para->getParD(level)->neighborZ_SP,
-//							para->getParD(level)->size_Mat_SP, 
-//							para->getParD(level)->evenOrOdd,
-//							para->getParD(level)->numberofthreads);
-//		//////////////////////////////////////////////////////////////////////////
-//	}
+//    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+//    {
+//        //////////////////////////////////////////////////////////////////////////
+//        GetSendFsPostDev27( para->getParD(level)->d0SP.f[0],
+//                            para->getParD(level)->sendProcessNeighborZ[i].f[0],
+//                            para->getParD(level)->sendProcessNeighborZ[i].index,
+//                            para->getParD(level)->sendProcessNeighborZ[i].numberOfNodes,
+//                            para->getParD(level)->neighborX_SP, 
+//                            para->getParD(level)->neighborY_SP, 
+//                            para->getParD(level)->neighborZ_SP,
+//                            para->getParD(level)->size_Mat_SP, 
+//                            para->getParD(level)->evenOrOdd,
+//                            para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborZFsDH(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        comm.exchngDataGPU(para->getParH(level)->sendProcessNeighborZ[i].f[0], 
+//                            para->getParH(level)->sendProcessNeighborZ[i].numberOfFs,
+//                            para->getParH(level)->recvProcessNeighborZ[i].f[0],
+//                            para->getParH(level)->recvProcessNeighborZ[i].numberOfFs,
+//                            para->getParH(level)->sendProcessNeighborZ[i].rankNeighbor);
+//        //////////////////////////////////////////////////////////////////////////
+//        para->cudaCopyProcessNeighborZFsHD(level, i);
+//        //////////////////////////////////////////////////////////////////////////
+//        SetRecvFsPostDev27( para->getParD(level)->d0SP.f[0],
+//                            para->getParD(level)->recvProcessNeighborZ[i].f[0],
+//                            para->getParD(level)->recvProcessNeighborZ[i].index,
+//                            para->getParD(level)->recvProcessNeighborZ[i].numberOfNodes,
+//                            para->getParD(level)->neighborX_SP, 
+//                            para->getParD(level)->neighborY_SP, 
+//                            para->getParD(level)->neighborZ_SP,
+//                            para->getParD(level)->size_Mat_SP, 
+//                            para->getParD(level)->evenOrOdd,
+//                            para->getParD(level)->numberofthreads);
+//        //////////////////////////////////////////////////////////////////////////
+//    }
 //}
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -934,164 +769,164 @@ void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, Cud
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangePreCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		GetSendFsPreDev27(para->getParD(level)->d27.f[0],
-						  para->getParD(level)->sendProcessNeighborADX[i].f[0],
-						  para->getParD(level)->sendProcessNeighborADX[i].index,
-						  para->getParD(level)->sendProcessNeighborADX[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborADXFsDH(level, i);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0],
-							para->getParH(level)->recvProcessNeighborADX[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
-						  para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
-						  para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborADXFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPreDev27(para->getParD(level)->d27.f[0],
-						  para->getParD(level)->recvProcessNeighborADX[i].f[0],
-						  para->getParD(level)->recvProcessNeighborADX[i].index,
-						  para->getParD(level)->recvProcessNeighborADX[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        GetSendFsPreDev27(para->getParD(level)->d27.f[0],
+                          para->getParD(level)->sendProcessNeighborADX[i].f[0],
+                          para->getParD(level)->sendProcessNeighborADX[i].index,
+                          para->getParD(level)->sendProcessNeighborADX[i].numberOfNodes,
+                          para->getParD(level)->neighborX_SP, 
+                          para->getParD(level)->neighborY_SP, 
+                          para->getParD(level)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP, 
+                          para->getParD(level)->evenOrOdd,
+                          para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborADXFsDH(level, i);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0],
+                            para->getParH(level)->recvProcessNeighborADX[i].numberOfFs,
+                            para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////start non blocking MPI send
+    //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    //{
+    //    comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
+    //                        para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
+    //                        para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////Waitall
+    //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
+    //{
+    //    comm.waitallGPU();
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
+                          para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
+                          para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborADXFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        SetRecvFsPreDev27(para->getParD(level)->d27.f[0],
+                          para->getParD(level)->recvProcessNeighborADX[i].f[0],
+                          para->getParD(level)->recvProcessNeighborADX[i].index,
+                          para->getParD(level)->recvProcessNeighborADX[i].numberOfNodes,
+                          para->getParD(level)->neighborX_SP, 
+                          para->getParD(level)->neighborY_SP, 
+                          para->getParD(level)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP, 
+                          para->getParD(level)->evenOrOdd,
+                          para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		GetSendFsPostDev27(para->getParD(level)->d27.f[0],
-						   para->getParD(level)->sendProcessNeighborADX[i].f[0],
-						   para->getParD(level)->sendProcessNeighborADX[i].index,
-						   para->getParD(level)->sendProcessNeighborADX[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborADXFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0],
-							para->getParH(level)->recvProcessNeighborADX[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
-						  para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
-						  para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborADXFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPostDev27(para->getParD(level)->d27.f[0],
-						   para->getParD(level)->recvProcessNeighborADX[i].f[0],
-						   para->getParD(level)->recvProcessNeighborADX[i].index,
-						   para->getParD(level)->recvProcessNeighborADX[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        GetSendFsPostDev27(para->getParD(level)->d27.f[0],
+                           para->getParD(level)->sendProcessNeighborADX[i].f[0],
+                           para->getParD(level)->sendProcessNeighborADX[i].index,
+                           para->getParD(level)->sendProcessNeighborADX[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP, 
+                           para->getParD(level)->neighborZ_SP,
+                           para->getParD(level)->size_Mat_SP, 
+                           para->getParD(level)->evenOrOdd,
+                           para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborADXFsDH(level, i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADX[i].f[0],
+                            para->getParH(level)->recvProcessNeighborADX[i].numberOfFs,
+                            para->getParH(level)->recvProcessNeighborADX[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////start non blocking MPI send
+    //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    //{
+    //    comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
+    //                        para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
+    //                        para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////Waitall
+    //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
+    //{
+    //    comm.waitallGPU();
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADX[i].f[0],
+                          para->getParH(level)->sendProcessNeighborADX[i].numberOfFs,
+                          para->getParH(level)->sendProcessNeighborADX[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborADXFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        SetRecvFsPostDev27(para->getParD(level)->d27.f[0],
+                           para->getParD(level)->recvProcessNeighborADX[i].f[0],
+                           para->getParD(level)->recvProcessNeighborADX[i].index,
+                           para->getParD(level)->recvProcessNeighborADX[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP, 
+                           para->getParD(level)->neighborZ_SP,
+                           para->getParD(level)->size_Mat_SP, 
+                           para->getParD(level)->evenOrOdd,
+                           para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
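The ADX pair above and the ADY/ADZ pairs below are line-for-line identical apart from the kernel pair (GetSendFsPreDev27/SetRecvFsPreDev27 vs. the Post variants), the per-direction neighbor arrays, and the copy helpers. A purely hypothetical consolidation sketch of how the shared control flow could be factored out once; none of these names (NeighborSet, exchangeAD, ...) exist in the code base:

#include <functional>

// Hypothetical per-direction view: each callback wraps one step of the
// pattern for neighbor i (gather+copy, recv, send, wait, scatter).
struct NeighborSet {
    int count;                                        // number of process neighbors
    std::function<void(int)> gatherAndCopyToHost;     // kernel + cudaCopy...FsDH
    std::function<void(int)> postRecv;                // comm.nbRecvDataGPU
    std::function<void(int)> blockingSend;            // comm.sendDataGPU
    std::function<void(int)> wait;                    // comm.waitGPU
    std::function<void()>    resetRequests;           // comm.resetRequest
    std::function<void(int)> copyToDeviceAndScatter;  // cudaCopy...FsHD + kernel
};

void exchangeAD(const NeighborSet& ns)
{
    for (int i = 0; i < ns.count; ++i) ns.gatherAndCopyToHost(i);
    for (int i = 0; i < ns.count; ++i) ns.postRecv(i);
    for (int i = 0; i < ns.count; ++i) ns.blockingSend(i);
    for (int i = 0; i < ns.count; ++i) ns.wait(i);
    if (ns.count > 0) ns.resetRequests();
    for (int i = 0; i < ns.count; ++i) ns.copyToDeviceAndScatter(i);
}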
 
@@ -1103,164 +938,164 @@ void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm,
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangePreCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		GetSendFsPreDev27(para->getParD(level)->d27.f[0],
-						  para->getParD(level)->sendProcessNeighborADY[i].f[0],
-						  para->getParD(level)->sendProcessNeighborADY[i].index,
-						  para->getParD(level)->sendProcessNeighborADY[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborADYFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0],
-							para->getParH(level)->recvProcessNeighborADY[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
-						  para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
-			              para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborADYFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPreDev27(para->getParD(level)->d27.f[0],
-						  para->getParD(level)->recvProcessNeighborADY[i].f[0],
-						  para->getParD(level)->recvProcessNeighborADY[i].index,
-						  para->getParD(level)->recvProcessNeighborADY[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        GetSendFsPreDev27(para->getParD(level)->d27.f[0],
+                          para->getParD(level)->sendProcessNeighborADY[i].f[0],
+                          para->getParD(level)->sendProcessNeighborADY[i].index,
+                          para->getParD(level)->sendProcessNeighborADY[i].numberOfNodes,
+                          para->getParD(level)->neighborX_SP, 
+                          para->getParD(level)->neighborY_SP, 
+                          para->getParD(level)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP, 
+                          para->getParD(level)->evenOrOdd,
+                          para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborADYFsDH(level, i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0],
+                            para->getParH(level)->recvProcessNeighborADY[i].numberOfFs,
+                            para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////start non blocking MPI send
+    //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    //{
+    //    comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
+    //                        para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
+    //                        para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////Waitall
+    //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
+    //{
+    //    comm.waitallGPU();
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
+                          para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
+                          para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborADYFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        SetRecvFsPreDev27(para->getParD(level)->d27.f[0],
+                          para->getParD(level)->recvProcessNeighborADY[i].f[0],
+                          para->getParD(level)->recvProcessNeighborADY[i].index,
+                          para->getParD(level)->recvProcessNeighborADY[i].numberOfNodes,
+                          para->getParD(level)->neighborX_SP, 
+                          para->getParD(level)->neighborY_SP, 
+                          para->getParD(level)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP, 
+                          para->getParD(level)->evenOrOdd,
+                          para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		GetSendFsPostDev27(para->getParD(level)->d27.f[0],
-						   para->getParD(level)->sendProcessNeighborADY[i].f[0],
-						   para->getParD(level)->sendProcessNeighborADY[i].index,
-						   para->getParD(level)->sendProcessNeighborADY[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborADYFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0],
-							para->getParH(level)->recvProcessNeighborADY[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
-						  para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
-			              para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborADYFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPostDev27(para->getParD(level)->d27.f[0],
-						   para->getParD(level)->recvProcessNeighborADY[i].f[0],
-						   para->getParD(level)->recvProcessNeighborADY[i].index,
-						   para->getParD(level)->recvProcessNeighborADY[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        GetSendFsPostDev27(para->getParD(level)->d27.f[0],
+                           para->getParD(level)->sendProcessNeighborADY[i].f[0],
+                           para->getParD(level)->sendProcessNeighborADY[i].index,
+                           para->getParD(level)->sendProcessNeighborADY[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP, 
+                           para->getParD(level)->neighborZ_SP,
+                           para->getParD(level)->size_Mat_SP, 
+                           para->getParD(level)->evenOrOdd,
+                           para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborADYFsDH(level, i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADY[i].f[0],
+                            para->getParH(level)->recvProcessNeighborADY[i].numberOfFs,
+                            para->getParH(level)->recvProcessNeighborADY[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////start non blocking MPI send
+    //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    //{
+    //    comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
+    //                        para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
+    //                        para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////Waitall
+    //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
+    //{
+    //    comm.waitallGPU();
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADY[i].f[0],
+                          para->getParH(level)->sendProcessNeighborADY[i].numberOfFs,
+                          para->getParH(level)->sendProcessNeighborADY[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborADYFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        SetRecvFsPostDev27(para->getParD(level)->d27.f[0],
+                           para->getParD(level)->recvProcessNeighborADY[i].f[0],
+                           para->getParD(level)->recvProcessNeighborADY[i].index,
+                           para->getParD(level)->recvProcessNeighborADY[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP, 
+                           para->getParD(level)->neighborZ_SP,
+                           para->getParD(level)->size_Mat_SP, 
+                           para->getParD(level)->evenOrOdd,
+                           para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
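Two details of the control flow in these functions are worth noting. First, the non-blocking receives are posted before the blocking sends; with paired neighbors this ordering is what keeps the blocking sends from deadlocking, because each rank's send can complete into the receive its partner has already posted. Second, every loop, including the receive loops, iterates over the "send" neighbor count, which is correct only as long as the send and receive neighbor sets match one-to-one. For a single symmetric pair the same exchange can be written without explicit requests at all; a sketch using MPI_Sendrecv (buffer names illustrative):

#include <mpi.h>

// Combined send+receive for one symmetric neighbor pair: MPI matches the
// two halves internally, so neither rank can block waiting for the other
// side to post first.
void exchangeWithNeighbor(double* sendBuf, double* recvBuf, int count, int nbrRank)
{
    MPI_Sendrecv(sendBuf, count, MPI_DOUBLE, nbrRank, 0,
                 recvBuf, count, MPI_DOUBLE, nbrRank, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}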
 
@@ -1272,164 +1107,164 @@ void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm,
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangePreCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		GetSendFsPreDev27(para->getParD(level)->d27.f[0],
-						  para->getParD(level)->sendProcessNeighborADZ[i].f[0],
-						  para->getParD(level)->sendProcessNeighborADZ[i].index,
-						  para->getParD(level)->sendProcessNeighborADZ[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborADZFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0],
-							para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
-						  para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
-						  para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborADZFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPreDev27(para->getParD(level)->d27.f[0],
-						  para->getParD(level)->recvProcessNeighborADZ[i].f[0],
-						  para->getParD(level)->recvProcessNeighborADZ[i].index,
-						  para->getParD(level)->recvProcessNeighborADZ[i].numberOfNodes,
-						  para->getParD(level)->neighborX_SP, 
-						  para->getParD(level)->neighborY_SP, 
-						  para->getParD(level)->neighborZ_SP,
-						  para->getParD(level)->size_Mat_SP, 
-						  para->getParD(level)->evenOrOdd,
-						  para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        GetSendFsPreDev27(para->getParD(level)->d27.f[0],
+                          para->getParD(level)->sendProcessNeighborADZ[i].f[0],
+                          para->getParD(level)->sendProcessNeighborADZ[i].index,
+                          para->getParD(level)->sendProcessNeighborADZ[i].numberOfNodes,
+                          para->getParD(level)->neighborX_SP, 
+                          para->getParD(level)->neighborY_SP, 
+                          para->getParD(level)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP, 
+                          para->getParD(level)->evenOrOdd,
+                          para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborADZFsDH(level, i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0],
+                            para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs,
+                            para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////start non blocking MPI send
+    //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    //{
+    //    comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
+    //                        para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
+    //                        para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////Waitall
+    //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
+    //{
+    //    comm.waitallGPU();
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
+                          para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
+                          para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborADZFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        SetRecvFsPreDev27(para->getParD(level)->d27.f[0],
+                          para->getParD(level)->recvProcessNeighborADZ[i].f[0],
+                          para->getParD(level)->recvProcessNeighborADZ[i].index,
+                          para->getParD(level)->recvProcessNeighborADZ[i].numberOfNodes,
+                          para->getParD(level)->neighborX_SP, 
+                          para->getParD(level)->neighborY_SP, 
+                          para->getParD(level)->neighborZ_SP,
+                          para->getParD(level)->size_Mat_SP, 
+                          para->getParD(level)->evenOrOdd,
+                          para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		GetSendFsPostDev27(para->getParD(level)->d27.f[0],
-						   para->getParD(level)->sendProcessNeighborADZ[i].f[0],
-						   para->getParD(level)->sendProcessNeighborADZ[i].index,
-						   para->getParD(level)->sendProcessNeighborADZ[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborADZFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0],
-							para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs,
-							para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////start non blocking MPI send
-	//for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	//{
-	//	comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
-	//						para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
-	//						para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	////Waitall
-	//if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	//{
-	//	comm.waitallGPU();
-	//}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
-						  para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
-						  para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborADZFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		SetRecvFsPostDev27(para->getParD(level)->d27.f[0],
-						   para->getParD(level)->recvProcessNeighborADZ[i].f[0],
-						   para->getParD(level)->recvProcessNeighborADZ[i].index,
-						   para->getParD(level)->recvProcessNeighborADZ[i].numberOfNodes,
-						   para->getParD(level)->neighborX_SP, 
-						   para->getParD(level)->neighborY_SP, 
-						   para->getParD(level)->neighborZ_SP,
-						   para->getParD(level)->size_Mat_SP, 
-						   para->getParD(level)->evenOrOdd,
-						   para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        GetSendFsPostDev27(para->getParD(level)->d27.f[0],
+                           para->getParD(level)->sendProcessNeighborADZ[i].f[0],
+                           para->getParD(level)->sendProcessNeighborADZ[i].index,
+                           para->getParD(level)->sendProcessNeighborADZ[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP, 
+                           para->getParD(level)->neighborZ_SP,
+                           para->getParD(level)->size_Mat_SP, 
+                           para->getParD(level)->evenOrOdd,
+                           para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborADZFsDH(level, i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(para->getParH(level)->recvProcessNeighborADZ[i].f[0],
+                            para->getParH(level)->recvProcessNeighborADZ[i].numberOfFs,
+                            para->getParH(level)->recvProcessNeighborADZ[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////start non blocking MPI send
+    //for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    //{
+    //    comm.nbSendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
+    //                        para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
+    //                        para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////Waitall
+    //if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
+    //{
+    //    comm.waitallGPU();
+    //}
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.sendDataGPU(para->getParH(level)->sendProcessNeighborADZ[i].f[0],
+                          para->getParH(level)->sendProcessNeighborADZ[i].numberOfFs,
+                          para->getParH(level)->sendProcessNeighborADZ[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborADZFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        SetRecvFsPostDev27(para->getParD(level)->d27.f[0],
+                           para->getParD(level)->recvProcessNeighborADZ[i].f[0],
+                           para->getParD(level)->recvProcessNeighborADZ[i].index,
+                           para->getParD(level)->recvProcessNeighborADZ[i].numberOfNodes,
+                           para->getParD(level)->neighborX_SP, 
+                           para->getParD(level)->neighborY_SP, 
+                           para->getParD(level)->neighborZ_SP,
+                           para->getParD(level)->size_Mat_SP, 
+                           para->getParD(level)->evenOrOdd,
+                           para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
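All of these exchange routines share one ordering: the non-blocking receives are posted before the blocking sends, so two ranks exchanging buffers cannot deadlock on each other's send. A minimal, self-contained MPI sketch of that ordering (hypothetical two-rank setup, not the VirtualFluids API):

    #include <mpi.h>
    #include <vector>

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        int rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        int peer = 1 - rank; // assumes exactly two ranks
        std::vector<double> sendBuf(27, (double)rank);
        std::vector<double> recvBuf(27);
        MPI_Request recvRequest;
        // post the non-blocking receive first (analogous to comm.nbRecvDataGPU)
        MPI_Irecv(recvBuf.data(), 27, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &recvRequest);
        // the blocking send can no longer deadlock (analogous to comm.sendDataGPU)
        MPI_Send(sendBuf.data(), 27, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD);
        // wait for the posted receive (analogous to comm.waitGPU and comm.resetRequest)
        MPI_Wait(&recvRequest, MPI_STATUS_IGNORE);
        MPI_Finalize();
        return 0;
    }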
 
@@ -1488,73 +1323,73 @@ void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm,
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangeCollDataF3XGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		getSendGsDevF3(
-			para->getParD(level)->g6.g[0],
-			para->getParD(level)->sendProcessNeighborF3X[i].g[0],
-			para->getParD(level)->sendProcessNeighborF3X[i].index,
-			para->getParD(level)->sendProcessNeighborF3X[i].numberOfNodes,
-			para->getParD(level)->neighborX_SP,
-			para->getParD(level)->neighborY_SP,
-			para->getParD(level)->neighborZ_SP,
-			para->getParD(level)->size_Mat_SP,
-			para->getParD(level)->evenOrOdd,
-			para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborF3XFsDH(level, i);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(
-			para->getParH(level)->recvProcessNeighborF3X[i].g[0],
-			para->getParH(level)->recvProcessNeighborF3X[i].numberOfGs,
-			para->getParH(level)->recvProcessNeighborF3X[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.sendDataGPU(
-			para->getParH(level)->sendProcessNeighborF3X[i].g[0],
-			para->getParH(level)->sendProcessNeighborF3X[i].numberOfGs,
-			para->getParH(level)->sendProcessNeighborF3X[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborF3XFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		setRecvGsDevF3(
-			para->getParD(level)->g6.g[0],
-			para->getParD(level)->recvProcessNeighborF3X[i].g[0],
-			para->getParD(level)->recvProcessNeighborF3X[i].index,
-			para->getParD(level)->recvProcessNeighborF3X[i].numberOfNodes,
-			para->getParD(level)->neighborX_SP,
-			para->getParD(level)->neighborY_SP,
-			para->getParD(level)->neighborZ_SP,
-			para->getParD(level)->size_Mat_SP,
-			para->getParD(level)->evenOrOdd,
-			para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        getSendGsDevF3(
+            para->getParD(level)->g6.g[0],
+            para->getParD(level)->sendProcessNeighborF3X[i].g[0],
+            para->getParD(level)->sendProcessNeighborF3X[i].index,
+            para->getParD(level)->sendProcessNeighborF3X[i].numberOfNodes,
+            para->getParD(level)->neighborX_SP,
+            para->getParD(level)->neighborY_SP,
+            para->getParD(level)->neighborZ_SP,
+            para->getParD(level)->size_Mat_SP,
+            para->getParD(level)->evenOrOdd,
+            para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborF3XFsDH(level, i);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non-blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(
+            para->getParH(level)->recvProcessNeighborF3X[i].g[0],
+            para->getParH(level)->recvProcessNeighborF3X[i].numberOfGs,
+            para->getParH(level)->recvProcessNeighborF3X[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.sendDataGPU(
+            para->getParH(level)->sendProcessNeighborF3X[i].g[0],
+            para->getParH(level)->sendProcessNeighborF3X[i].numberOfGs,
+            para->getParH(level)->sendProcessNeighborF3X[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsX(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborF3XFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        setRecvGsDevF3(
+            para->getParD(level)->g6.g[0],
+            para->getParD(level)->recvProcessNeighborF3X[i].g[0],
+            para->getParD(level)->recvProcessNeighborF3X[i].index,
+            para->getParD(level)->recvProcessNeighborF3X[i].numberOfNodes,
+            para->getParD(level)->neighborX_SP,
+            para->getParD(level)->neighborY_SP,
+            para->getParD(level)->neighborZ_SP,
+            para->getParD(level)->size_Mat_SP,
+            para->getParD(level)->evenOrOdd,
+            para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -1566,73 +1401,73 @@ void exchangeCollDataF3XGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangeCollDataF3YGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		getSendGsDevF3(
-			para->getParD(level)->g6.g[0],
-			para->getParD(level)->sendProcessNeighborF3Y[i].g[0],
-			para->getParD(level)->sendProcessNeighborF3Y[i].index,
-			para->getParD(level)->sendProcessNeighborF3Y[i].numberOfNodes,
-			para->getParD(level)->neighborX_SP,
-			para->getParD(level)->neighborY_SP,
-			para->getParD(level)->neighborZ_SP,
-			para->getParD(level)->size_Mat_SP,
-			para->getParD(level)->evenOrOdd,
-			para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborF3YFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(
-			para->getParH(level)->recvProcessNeighborF3Y[i].g[0],
-			para->getParH(level)->recvProcessNeighborF3Y[i].numberOfGs,
-			para->getParH(level)->recvProcessNeighborF3Y[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.sendDataGPU(
-			para->getParH(level)->sendProcessNeighborF3Y[i].g[0],
-			para->getParH(level)->sendProcessNeighborF3Y[i].numberOfGs,
-			para->getParH(level)->sendProcessNeighborF3Y[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborF3YFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		setRecvGsDevF3(
-			para->getParD(level)->g6.g[0],
-			para->getParD(level)->recvProcessNeighborF3Y[i].g[0],
-			para->getParD(level)->recvProcessNeighborF3Y[i].index,
-			para->getParD(level)->recvProcessNeighborF3Y[i].numberOfNodes,
-			para->getParD(level)->neighborX_SP,
-			para->getParD(level)->neighborY_SP,
-			para->getParD(level)->neighborZ_SP,
-			para->getParD(level)->size_Mat_SP,
-			para->getParD(level)->evenOrOdd,
-			para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        getSendGsDevF3(
+            para->getParD(level)->g6.g[0],
+            para->getParD(level)->sendProcessNeighborF3Y[i].g[0],
+            para->getParD(level)->sendProcessNeighborF3Y[i].index,
+            para->getParD(level)->sendProcessNeighborF3Y[i].numberOfNodes,
+            para->getParD(level)->neighborX_SP,
+            para->getParD(level)->neighborY_SP,
+            para->getParD(level)->neighborZ_SP,
+            para->getParD(level)->size_Mat_SP,
+            para->getParD(level)->evenOrOdd,
+            para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborF3YFsDH(level, i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non-blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(
+            para->getParH(level)->recvProcessNeighborF3Y[i].g[0],
+            para->getParH(level)->recvProcessNeighborF3Y[i].numberOfGs,
+            para->getParH(level)->recvProcessNeighborF3Y[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.sendDataGPU(
+            para->getParH(level)->sendProcessNeighborF3Y[i].g[0],
+            para->getParH(level)->sendProcessNeighborF3Y[i].numberOfGs,
+            para->getParH(level)->sendProcessNeighborF3Y[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsY(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborF3YFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        setRecvGsDevF3(
+            para->getParD(level)->g6.g[0],
+            para->getParD(level)->recvProcessNeighborF3Y[i].g[0],
+            para->getParD(level)->recvProcessNeighborF3Y[i].index,
+            para->getParD(level)->recvProcessNeighborF3Y[i].numberOfNodes,
+            para->getParD(level)->neighborX_SP,
+            para->getParD(level)->neighborY_SP,
+            para->getParD(level)->neighborZ_SP,
+            para->getParD(level)->size_Mat_SP,
+            para->getParD(level)->evenOrOdd,
+            para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -1644,92 +1479,72 @@ void exchangeCollDataF3YGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMe
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void exchangeCollDataF3ZGPU(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level)
 {
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Device to Host
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		getSendGsDevF3(
-			para->getParD(level)->g6.g[0],
-			para->getParD(level)->sendProcessNeighborF3Z[i].g[0],
-			para->getParD(level)->sendProcessNeighborF3Z[i].index,
-			para->getParD(level)->sendProcessNeighborF3Z[i].numberOfNodes,
-			para->getParD(level)->neighborX_SP,
-			para->getParD(level)->neighborY_SP,
-			para->getParD(level)->neighborZ_SP,
-			para->getParD(level)->size_Mat_SP,
-			para->getParD(level)->evenOrOdd,
-			para->getParD(level)->numberofthreads);
-		//////////////////////////////////////////////////////////////////////////
-		cudaManager->cudaCopyProcessNeighborF3ZFsDH(level, i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start non blocking MPI receive
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.nbRecvDataGPU(
-			para->getParH(level)->recvProcessNeighborF3Z[i].g[0],
-			para->getParH(level)->recvProcessNeighborF3Z[i].numberOfGs,
-			para->getParH(level)->recvProcessNeighborF3Z[i].rankNeighbor);
-	}
-	/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//start blocking MPI send
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.sendDataGPU(
-			para->getParH(level)->sendProcessNeighborF3Z[i].g[0],
-			para->getParH(level)->sendProcessNeighborF3Z[i].numberOfGs,
-			para->getParH(level)->sendProcessNeighborF3Z[i].rankNeighbor);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//Wait
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		comm.waitGPU(i);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//reset the request array
-	if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
-	{
-		comm.resetRequest();
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	//copy Host to Device
-	for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
-	{
-		cudaManager->cudaCopyProcessNeighborF3ZFsHD(level, i);
-		//////////////////////////////////////////////////////////////////////////
-		setRecvGsDevF3(
-			para->getParD(level)->g6.g[0],
-			para->getParD(level)->recvProcessNeighborF3Z[i].g[0],
-			para->getParD(level)->recvProcessNeighborF3Z[i].index,
-			para->getParD(level)->recvProcessNeighborF3Z[i].numberOfNodes,
-			para->getParD(level)->neighborX_SP,
-			para->getParD(level)->neighborY_SP,
-			para->getParD(level)->neighborZ_SP,
-			para->getParD(level)->size_Mat_SP,
-			para->getParD(level)->evenOrOdd,
-			para->getParD(level)->numberofthreads);
-	}
-	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Device to Host
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        getSendGsDevF3(
+            para->getParD(level)->g6.g[0],
+            para->getParD(level)->sendProcessNeighborF3Z[i].g[0],
+            para->getParD(level)->sendProcessNeighborF3Z[i].index,
+            para->getParD(level)->sendProcessNeighborF3Z[i].numberOfNodes,
+            para->getParD(level)->neighborX_SP,
+            para->getParD(level)->neighborY_SP,
+            para->getParD(level)->neighborZ_SP,
+            para->getParD(level)->size_Mat_SP,
+            para->getParD(level)->evenOrOdd,
+            para->getParD(level)->numberofthreads);
+        //////////////////////////////////////////////////////////////////////////
+        cudaManager->cudaCopyProcessNeighborF3ZFsDH(level, i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start non-blocking MPI receive
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.nbRecvDataGPU(
+            para->getParH(level)->recvProcessNeighborF3Z[i].g[0],
+            para->getParH(level)->recvProcessNeighborF3Z[i].numberOfGs,
+            para->getParH(level)->recvProcessNeighborF3Z[i].rankNeighbor);
+    }
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //start blocking MPI send
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.sendDataGPU(
+            para->getParH(level)->sendProcessNeighborF3Z[i].g[0],
+            para->getParH(level)->sendProcessNeighborF3Z[i].numberOfGs,
+            para->getParH(level)->sendProcessNeighborF3Z[i].rankNeighbor);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //Wait
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        comm.waitGPU(i);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //reset the request array
+    if (0 < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")))
+    {
+        comm.resetRequest();
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //copy Host to Device
+    for (unsigned int i = 0; i < (unsigned int)(para->getNumberOfProcessNeighborsZ(level, "send")); i++)
+    {
+        cudaManager->cudaCopyProcessNeighborF3ZFsHD(level, i);
+        //////////////////////////////////////////////////////////////////////////
+        setRecvGsDevF3(
+            para->getParD(level)->g6.g[0],
+            para->getParD(level)->recvProcessNeighborF3Z[i].g[0],
+            para->getParD(level)->recvProcessNeighborF3Z[i].index,
+            para->getParD(level)->recvProcessNeighborF3Z[i].numberOfNodes,
+            para->getParD(level)->neighborX_SP,
+            para->getParD(level)->neighborY_SP,
+            para->getParD(level)->neighborZ_SP,
+            para->getParD(level)->size_Mat_SP,
+            para->getParD(level)->evenOrOdd,
+            para->getParD(level)->numberofthreads);
+    }
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 }
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
index 82662cdc55e8b0ff5f4afe7d31a6563579b45559..c6116ea37e6a6b17c7c3ded73d3e8478f07c41da 100644
--- a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27.h
@@ -1,39 +1,153 @@
 #ifndef EXCHANGEDATA27_H
 #define EXCHANGEDATA27_H
 
-#include "LBM/LB.h"
-#include "GPU/GPU_Interface.h"
-#include "Parameter/Parameter.h"
 #include "Communication/Communicator.h"
 #include "GPU/CudaMemoryManager.h"
+#include "GPU/GPU_Interface.h"
+#include "LBM/LB.h"
+#include "Parameter/Parameter.h"
+
+//! \file ExchangeData27.h
+//! \ingroup GPU
+//! \author Martin Schoenherr, Anna Wellmann
+//! \brief routines for data exchange when running simulations on multiple GPUs
+
+//////////////////////////////////////////////////////////////////////////
+// 1D domain decomposition
+extern "C" void exchangePreCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                         int level);
+extern "C" void exchangePostCollDataGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                          int level);
+//////////////////////////////////////////////////////////////////////////
+// 3D domain decomposition
+
+// functions used for all directions
+
+//! \brief collect the send nodes in a buffer on the gpu
+extern "C" void collectNodesInSendBufferGPU(Parameter *para, int level, int streamIndex,
+                                            std::vector<ProcessNeighbor27> *sendProcessNeighbor,
+                                            unsigned int numberOfSendProcessNeighbors);
+//! \brief distribute the receive nodes from the buffer on the gpu
+extern "C" void scatterNodesFromRecvBufferGPU(Parameter *para, int level, int streamIndex,
+                                              std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                                              unsigned int numberOfRecvProcessNeighbors);
+//! \brief copy nodes which are part of the communication in multiple directions
+//! \details The nodes are copied from the receive buffer in one direction to the send buffer in another direction.
+//! The copy operation is conducted on the CPU.
+//! \ref see master thesis of Anna Wellmann (p. 56f: "Communication Hiding bei der Verwendung eines uniformen
+//! Simulationsgitters")
+//! \param edgeNodes determines from where to where the nodes are copied
+//! \param recvProcessNeighborHost is a reference to the receive buffer on the host; nodes are copied from here
+//! \param sendProcessNeighborHost is a reference to the send buffer on the host; nodes are copied to here
+extern "C" void copyEdgeNodes(std::vector<LBMSimulationParameter::EdgeNodePositions> &edgeNodes,
+                              std::vector<ProcessNeighbor27> &recvProcessNeighborHost,
+                              std::vector<ProcessNeighbor27> &sendProcessNeighborHost);
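For orientation, the copy amounts to the following (a sketch consistent with the doc comment above and with ExchangeData27Test.cpp further down, not necessarily the exact implementation; it assumes the direction-major buffer layout f[direction * numberOfNodes + node] and index 0 for both process neighbors, as the test uses):

    for (auto &edgeNode : edgeNodes) {
        // AfterFtoC: the send buffer only holds the nodes taking part in the
        // interpolation, so edge nodes beyond that range are skipped
        if (edgeNode.indexInSendBuffer >= (int)sendProcessNeighborHost[0].numberOfNodes)
            continue;
        for (int dir = 0; dir <= (int)dirEND; dir++)
            sendProcessNeighborHost[0].f[0][dir * sendProcessNeighborHost[0].numberOfNodes + edgeNode.indexInSendBuffer] =
                recvProcessNeighborHost[0].f[0][dir * recvProcessNeighborHost[0].numberOfNodes + edgeNode.indexInRecvBuffer];
    }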
 
 //////////////////////////////////////////////////////////////////////////
-//1D domain decomposition
-extern "C" void exchangePreCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-//////////////////////////////////////////////////////////////////////////
-//3D domain decomposition
-extern "C" void exchangePreCollDataXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePreCollDataYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePreCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-//////////////////////////////////////////////////////////////////////////
-//3D domain decomposition convection diffusion
-extern "C" void exchangePreCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePreCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePreCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataADXGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataADYGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangePostCollDataADZGPU27(Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-//////////////////////////////////////////////////////////////////////////
-//3D domain decomposition F3 - K18/K20
-extern "C" void exchangeCollDataF3XGPU( Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangeCollDataF3YGPU( Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-extern "C" void exchangeCollDataF3ZGPU( Parameter* para, vf::gpu::Communicator& comm, CudaMemoryManager* cudaManager, int level);
-//////////////////////////////////////////////////////////////////////////
-extern "C" void barrierGPU(vf::gpu::Communicator& comm);
+// x
+
+//! \brief collect the send nodes for communication in the x direction in a buffer on the gpu
+//! \details needed to exchange all nodes, used in the communication after the collision step
+extern "C" void prepareExchangeCollDataXGPU27AllNodes(Parameter *para, int level, int streamIndex);
+//! \brief collect the send nodes for communication in the x direction in a buffer on the gpu
+//! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in
+//! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine.
+//! \ref see master thesis of Anna Wellmann
+extern "C" void prepareExchangeCollDataXGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+//! \brief exchange routine in x direction for simulations on multiple gpus
+//! \details send and receive the nodes from the communication buffers on the gpus
+//! \param comm is needed for the communication between the processes via MPI
+//! \param cudaManager is needed for moving the data between host and device
+//! \param streamIndex is the index of a CUDA stream, which is needed for communication hiding
+//! \param sendProcessNeighborDev, recvProcessNeighborDev, sendProcessNeighborHost, recvProcessNeighborHost are pointers
+//! to the send and receive arrays, both on the device and the host
+extern "C" void exchangeCollDataXGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                       int level, int streamIndex,
+                                       std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                                       std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                                       std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
+                                       std::vector<ProcessNeighbor27> *recvProcessNeighborHost);
+//! \brief calls exchangeCollDataXGPU27() for exchanging all nodes
+//! \details used in the communication after the collision step
+extern "C" void exchangeCollDataXGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
+                                               CudaMemoryManager *cudaManager, int level, int streamIndex);
+//! \brief calls exchangeCollDataXGPU27() for exchanging the nodes which are part of the communication between the two
+//! interpolation processes on refined grids
+//! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in
+//! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine.
+//! \ref see master thesis of Anna Wellmann
+extern "C" void exchangeCollDataXGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
+                                                CudaMemoryManager *cudaManager, int level, int streamIndex);
+//! \brief distribute the receive nodes (x direction) from the buffer on the gpu
+//! \details needed to exchange all nodes, used in the communication after the collision step
+extern "C" void scatterNodesFromRecvBufferXGPU27AllNodes(Parameter *para, int level, int streamIndex);
+//! \brief distribute the receive nodes (x direction) from the buffer on the gpu
+//! \details Only exchange nodes which are part of the interpolation process on refined grids. This function is used in
+//! the exchange which takes place after the interpolation fine to coarse and before the interpolation coarse to fine.
+//! \ref see master thesis of Anna Wellmann
+extern "C" void scatterNodesFromRecvBufferXGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+
+//////////////////////////////////////////////////////////////////////////
+// y
+
+extern "C" void prepareExchangeCollDataYGPU27AllNodes(Parameter *para, int level, int streamIndex);
+extern "C" void prepareExchangeCollDataYGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+
+extern "C" void exchangeCollDataYGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                       int level, int streamIndex,
+                                       std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                                       std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                                       std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
+                                       std::vector<ProcessNeighbor27> *recvProcessNeighborHost);
+extern "C" void exchangeCollDataYGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
+                                               CudaMemoryManager *cudaManager, int level, int streamIndex);
+extern "C" void exchangeCollDataYGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
+                                                CudaMemoryManager *cudaManager, int level, int streamIndex);
+extern "C" void scatterNodesFromRecvBufferYGPU27AllNodes(Parameter *para, int level, int streamIndex);
+extern "C" void scatterNodesFromRecvBufferYGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+
+//////////////////////////////////////////////////////////////////////////
+// z
+extern "C" void prepareExchangeCollDataZGPU27AllNodes(Parameter *para, int level, int streamIndex);
+extern "C" void prepareExchangeCollDataZGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+
+extern "C" void exchangeCollDataZGPU27(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                       int level, int streamIndex,
+                                       std::vector<ProcessNeighbor27> *sendProcessNeighborDev,
+                                       std::vector<ProcessNeighbor27> *recvProcessNeighborDev,
+                                       std::vector<ProcessNeighbor27> *sendProcessNeighborHost,
+                                       std::vector<ProcessNeighbor27> *recvProcessNeighborHost);
+extern "C" void exchangeCollDataZGPU27AllNodes(Parameter *para, vf::gpu::Communicator &comm,
+                                               CudaMemoryManager *cudaManager, int level, int streamIndex);
+extern "C" void exchangeCollDataZGPU27AfterFtoC(Parameter *para, vf::gpu::Communicator &comm,
+                                                CudaMemoryManager *cudaManager, int level, int streamIndex);
+
+extern "C" void scatterNodesFromRecvBufferZGPU27AllNodes(Parameter *para, int level, int streamIndex);
+extern "C" void scatterNodesFromRecvBufferZGPU27AfterFtoC(Parameter *para, int level, int streamIndex);
+
+//////////////////////////////////////////////////////////////////////////
+// 3D domain decomposition convection diffusion
+extern "C" void exchangePreCollDataADXGPU27(Parameter *para, vf::gpu::Communicator &comm,
+                                            CudaMemoryManager *cudaManager, int level);
+extern "C" void exchangePreCollDataADYGPU27(Parameter *para, vf::gpu::Communicator &comm,
+                                            CudaMemoryManager *cudaManager, int level);
+extern "C" void exchangePreCollDataADZGPU27(Parameter *para, vf::gpu::Communicator &comm,
+                                            CudaMemoryManager *cudaManager, int level);
+extern "C" void exchangePostCollDataADXGPU27(Parameter *para, vf::gpu::Communicator &comm,
+                                             CudaMemoryManager *cudaManager, int level);
+extern "C" void exchangePostCollDataADYGPU27(Parameter *para, vf::gpu::Communicator &comm,
+                                             CudaMemoryManager *cudaManager, int level);
+extern "C" void exchangePostCollDataADZGPU27(Parameter *para, vf::gpu::Communicator &comm,
+                                             CudaMemoryManager *cudaManager, int level);
+//////////////////////////////////////////////////////////////////////////
+// 3D domain decomposition F3 - K18/K20
+extern "C" void exchangeCollDataF3XGPU(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                       int level);
+extern "C" void exchangeCollDataF3YGPU(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                       int level);
+extern "C" void exchangeCollDataF3ZGPU(Parameter *para, vf::gpu::Communicator &comm, CudaMemoryManager *cudaManager,
+                                       int level);
+//////////////////////////////////////////////////////////////////////////
+extern "C" void barrierGPU(vf::gpu::Communicator &comm);
 //////////////////////////////////////////////////////////////////////////
 
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cfg b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..e414d4f3173e555b8944fa9637ebbd2023ce393c
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cfg
@@ -0,0 +1,3 @@
+# these two parameters need to be defined in each config file
+Path = /output/path
+GridPath = /path/to/grid
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cpp b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3afedfb061211a15b74573d4e6043e8c3e59671b
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Communication/ExchangeData27Test.cpp
@@ -0,0 +1,145 @@
+#include <gmock/gmock.h>
+
+#include <filesystem>
+
+#include "ExchangeData27.h"
+#include "gpu/VirtualFluids_GPU/LBM/LB.h"
+
+#include <basics/config/ConfigurationFile.h>
+
+SPtr<Parameter> initParameterClass()
+{
+    std::filesystem::path filePath = __FILE__; // assuming that the config file is stored alongside this file
+    filePath.replace_filename("ExchangeData27Test.cfg");
+    vf::basics::ConfigurationFile config;
+    config.load(filePath.string());
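+    // presumably Parameter(configData, numberOfProcesses, processID)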
+    return std::make_shared<Parameter>(config, 1, 0);
+}
+
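+// replicates the first numberOfNodes entries once per lattice direction, so the
+// vector ends up in the direction-major 27 * numberOfNodes layout that the
+// tests below read back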
+void setUpFsByCopyingF0(std::vector<real> &distributionVector, int numberOfNodes)
+{
+    for (uint direction = 0; direction < dirEND; direction++) {
+        distributionVector.insert(distributionVector.end(), distributionVector.begin(),
+                                  distributionVector.begin() + numberOfNodes);
+    }
+}
+
+class ExchangeData27Test_CopyEdgeNodesXZTest : public testing::Test
+{
+protected:
+    SPtr<Parameter> para;
+    int level    = 0;
+    int numNodes = 10;
+    std::vector<real> recvFs;
+    std::vector<real> sendFs;
+    std::vector<ProcessNeighbor27> sendProcessNeighborHost;
+    std::vector<ProcessNeighbor27> recvProcessNeighborHost;
+
+    void SetUp() override
+    {
+        para = initParameterClass();
+        para->setMaxLevel(level + 1);       // setMaxLevel resizes parH
+        para->initLBMSimulationParameter(); // init parH
+
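+        // argument order is presumably (indexOfProcessNeighborRecv, indexInRecvBuffer,
+        // indexOfProcessNeighborSend, indexInSendBuffer); the three identical
+        // entries are deliberate duplicates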
+        para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 1, 0, 1);
+        para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 6, 0, 6);
+        para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 2, 0, 3);
+        para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
+        para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
+        para->getParH(level)->edgeNodesXtoZ.emplace_back(0, 7, 0, 8);
+    }
+
+    void setUpRecvProcessNeighbors(int numberOfNodesInRecv)
+    {
+        recvFs.resize(numberOfNodesInRecv);
+        std::fill(recvFs.begin(), recvFs.end(), 0.5); // 0.5s should not be copied
+        for (LBMSimulationParameter::EdgeNodePositions edgeNode : para->getParH(level)->edgeNodesXtoZ) {
+            if (edgeNode.indexInRecvBuffer > numberOfNodesInRecv) {
+                continue;
+            }
+            recvFs[edgeNode.indexInRecvBuffer] = 0.1; // 0.1s should be copied
+        }
+        setUpFsByCopyingF0(recvFs, numberOfNodesInRecv);
+
+        recvProcessNeighborHost.resize(1);
+        recvProcessNeighborHost[0].f[0]          = recvFs.data();
+        recvProcessNeighborHost[0].numberOfNodes = numberOfNodesInRecv;
+    }
+
+    void setUpSendProcessNeighbors(int numberOfNodesInSend)
+    {
+        sendFs.resize(27 * numberOfNodesInSend);
+        std::fill(sendFs.begin(), sendFs.end(), 0.0);
+
+        sendProcessNeighborHost.resize(1);
+        sendProcessNeighborHost[0].f[0]          = sendFs.data();
+        sendProcessNeighborHost[0].numberOfNodes = numberOfNodesInSend;
+    }
+};
+
+TEST_F(ExchangeData27Test_CopyEdgeNodesXZTest, copyEdgeNodes_XZ_CommunicationAfterFtoC_recvVectorFullSize)
+{
+    int numNodesAfterFtoC = 5; // nodes with indexInSendBuffer < 5 belong to the AfterFtoC subset
+    setUpRecvProcessNeighbors(numNodes);
+    setUpSendProcessNeighbors(numNodesAfterFtoC);
+
+    // expected
+    std::vector<real> expectedFs(numNodesAfterFtoC, 0.0);
+    expectedFs[1] = 0.1;
+    expectedFs[3] = 0.1;
+    setUpFsByCopyingF0(expectedFs, numNodesAfterFtoC);
+
+    // act
+    copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, recvProcessNeighborHost, sendProcessNeighborHost);
+
+    // convert result to std::vector
+    std::vector<real> result;
+    result.assign(sendProcessNeighborHost[0].f[0], sendProcessNeighborHost[0].f[0] + 27 * numNodesAfterFtoC);
+
+    EXPECT_THAT(result, testing::Eq(expectedFs));
+}
+
+TEST_F(ExchangeData27Test_CopyEdgeNodesXZTest, copyEdgeNodes_XZ_CommunicationAfterFtoC_recvVectorShort)
+{
+    int numNodesAfterFtoC = 5; // nodes with indexInSendBuffer < 5 belong to the AfterFtoC subset
+    setUpRecvProcessNeighbors(numNodesAfterFtoC);
+    setUpSendProcessNeighbors(numNodesAfterFtoC);
+
+    // expected
+    std::vector<real> expectedFs(numNodesAfterFtoC, 0.0);
+    expectedFs[1] = 0.1;
+    expectedFs[3] = 0.1;
+    setUpFsByCopyingF0(expectedFs, numNodesAfterFtoC);
+
+    // act
+    copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, recvProcessNeighborHost, sendProcessNeighborHost);
+
+    // convert result to std::vector
+    std::vector<real> result;
+    result.assign(sendProcessNeighborHost[0].f[0], sendProcessNeighborHost[0].f[0] + 27 * numNodesAfterFtoC);
+
+    EXPECT_THAT(result, testing::Eq(expectedFs));
+}
+
+TEST_F(ExchangeData27Test_CopyEdgeNodesXZTest, copyEdgeNodes_XZ_CommunicateAll)
+{
+    setUpRecvProcessNeighbors(numNodes);
+    setUpSendProcessNeighbors(numNodes);
+
+    // expected
+    std::vector<real> expectedFs(numNodes, 0.0);
+    expectedFs[1] = 0.1;
+    expectedFs[3] = 0.1;
+    expectedFs[6] = 0.1;
+    expectedFs[8] = 0.1;
+    setUpFsByCopyingF0(expectedFs, numNodes);
+
+    // act
+    copyEdgeNodes(para->getParH(level)->edgeNodesXtoZ, recvProcessNeighborHost, sendProcessNeighborHost);
+
+    // convert result to std::vector
+    std::vector<real> result;
+    result.assign(sendProcessNeighborHost[0].f[0], sendProcessNeighborHost[0].f[0] + 27 * numNodes);
+
+    EXPECT_THAT(result, testing::Eq(expectedFs));
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
index f21ee67a1054f6395b84377cbb71c3b2ff9ceaec..00e81c832ede87a7c2cd2e5f95dba0ec3a865b2d 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.cpp
@@ -9,9 +9,9 @@
 #include <GPU/CudaMemoryManager.h>
 
 
-std::shared_ptr<GridProvider> GridProvider::makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager)
+std::shared_ptr<GridProvider> GridProvider::makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager, vf::gpu::Communicator& communicator)
 {
-    return std::shared_ptr<GridProvider>(new GridGenerator(builder, para, cudaManager));
+    return std::shared_ptr<GridProvider>(new GridGenerator(builder, para, cudaManager, communicator));
 }
 
 std::shared_ptr<GridProvider> GridProvider::makeGridReader(FILEFORMAT format, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager)
@@ -29,6 +29,17 @@ void GridProvider::setNumberOfNodes(const int numberOfNodes, const int level) co
     para->getParD(level)->mem_size_int_SP = sizeof(uint) * para->getParD(level)->size_Mat_SP;
 }
 
+void GridProvider::setNumberOfFluidNodes(const int numberOfNodes, const int level) const
+{
+    para->getParH(level)->numberOfFluidNodes = numberOfNodes;
+    para->getParD(level)->numberOfFluidNodes = numberOfNodes;
+}
+
+void GridProvider::setNumberOfFluidNodesBorder(const int numberOfNodes, const int level) const
+{
+    para->getParH(level)->numberOffluidNodesBorder = numberOfNodes;
+    para->getParD(level)->numberOffluidNodesBorder = numberOfNodes;
+}
+
 void GridProvider::setInitalNodeValues(const int numberOfNodes, const int level) const
 {
     for (int j = 1; j <= numberOfNodes; j++)
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
index 3f2834e1157f4e7f9c44fb9db8987d1b0e679a5e..d8d9d6b02aa813cd6bacad503b3089b35dc8fa98 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h
@@ -8,8 +8,11 @@
 
 #include "PointerDefinitions.h"
 #include "VirtualFluids_GPU_export.h"
-
 #include "gpu/GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
+namespace vf::gpu
+{
+class Communicator;
+}
 
 class Parameter;
 class GridBuilder;
@@ -18,13 +21,16 @@ class CudaMemoryManager;
 class VIRTUALFLUIDS_GPU_EXPORT GridProvider
 {
 public:
-    static std::shared_ptr<GridProvider> makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager);
+    static std::shared_ptr<GridProvider> makeGridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager, vf::gpu::Communicator& communicator);
     static std::shared_ptr<GridProvider> makeGridReader(FILEFORMAT format, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager);
 
 	virtual void allocArrays_CoordNeighborGeo() = 0;
 	virtual void allocArrays_BoundaryValues() = 0;
 	virtual void allocArrays_BoundaryQs() = 0;
     virtual void allocArrays_OffsetScale() = 0;
+    virtual void allocArrays_fluidNodeIndices() = 0;
+    virtual void allocArrays_fluidNodeIndicesBorder() = 0;
+
 	virtual void setDimensions() = 0;
 	virtual void setBoundingBox() = 0;
 	virtual void initPeriodicNeigh(std::vector<std::vector<std::vector<unsigned int> > > periodV, std::vector<std::vector<unsigned int> > periodIndex, std::string way) = 0;
@@ -39,6 +45,8 @@ public:
 
 protected:
 	void setNumberOfNodes(const int numberOfNodes, const int level) const;
+    void setNumberOfFluidNodes(const int numberOfNodes, const int level) const;
+    void setNumberOfFluidNodesBorder(const int numberOfNodes, const int level) const;
     virtual void setInitalNodeValues(const int numberOfNodes, const int level) const;
 
 	void setPressSizePerLevel(int level, int sizePerLevel) const;
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.cpp
index 940dbfa617ccd06f5d7b77527cc78b618062240a..c18a0186c12fa066103aa249270419aa46cb1827 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.cpp
@@ -216,6 +216,16 @@ void GridReader::allocArrays_OffsetScale()
     std::cout << "-----Ende OffsetScale------" << std::endl;
 }
 
+void GridReader::allocArrays_fluidNodeIndices() {
+    std::cout << "GridReader::allocArrays_fluidNodeIndices not implemented" << std::endl;
+    // TODO
+}
+
+void GridReader::allocArrays_fluidNodeIndicesBorder() {
+    std::cout << "GridReader::allocArrays_fluidNodeIndicesBorder not implemented" << std::endl;
+    // TODO
+}
+
 
 void GridReader::setPressureValues(int channelSide) const
 {
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h
index f7a4c43062da79d39c43e6822688c51ad55e7442..ae592f9bde145b4008581d62772e0a31cb7237f8 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h
@@ -39,6 +39,8 @@ public:
 	void allocArrays_CoordNeighborGeo() override;
 	void allocArrays_BoundaryValues() override;
     void allocArrays_OffsetScale() override;
+    void allocArrays_fluidNodeIndices() override;
+    void allocArrays_fluidNodeIndicesBorder() override;
 
 	void initalValuesDomainDecompostion(int level);
 
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
index 9f2bfa4d2ac004237d7a7e62d04496089b05db61..87f9bf33d0e2b57588034f5f28e15c42d6bf462f 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
@@ -3,22 +3,23 @@
 #include "Parameter/Parameter.h"
 #include <GridGenerator/grid/GridBuilder/GridBuilder.h>
 #include <GPU/CudaMemoryManager.h>
+#include "IndexRearrangementForStreams.h"
 
 #include <sstream>
 #include <iostream>
+#include <algorithm>
 #include "utilities/math/Math.h"
-#include "LBM/LB.h"
 #include "Output/QDebugWriter.hpp"
 
 #include "utilities/communication.h"
 
 
-
-GridGenerator::GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager)
+GridGenerator::GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager, vf::gpu::Communicator& communicator)
 {
 	this->builder = builder;
     this->para = para;
     this->cudaMemoryManager = cudaManager;
+    this->indexRearrangement = std::make_unique<IndexRearrangementForStreams>(para, builder, communicator);
 }
 
 GridGenerator::~GridGenerator()
@@ -92,6 +93,24 @@ void GridGenerator::allocArrays_CoordNeighborGeo()
 	std::cout << "-----finish Coord, Neighbor, Geo------" << std::endl;
 }
 
+void GridGenerator::allocArrays_fluidNodeIndices() {
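+    // per level: store the node count, allocate memory on host and device, let
+    // the builder fill the host array, then copy it to the GPU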
+    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+        setNumberOfFluidNodes(builder->getNumberOfFluidNodes(level), level);
+        cudaMemoryManager->cudaAllocFluidNodeIndices(level);
+        builder->getFluidNodeIndices(para->getParH(level)->fluidNodeIndices, level);
+        cudaMemoryManager->cudaCopyFluidNodeIndices(level);
+    }
+}
+
+void GridGenerator::allocArrays_fluidNodeIndicesBorder() {
+    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+        setNumberOfFluidNodesBorder(builder->getNumberOfFluidNodesBorder(level), level);
+        cudaMemoryManager->cudaAllocFluidNodeIndicesBorder(level);
+        builder->getFluidNodeIndicesBorder(para->getParH(level)->fluidNodeIndicesBorder, level);
+        cudaMemoryManager->cudaCopyFluidNodeIndicesBorder(level);
+    }
+}
+
 void GridGenerator::allocArrays_BoundaryValues()
 {
 	std::cout << "------read BoundaryValues------" << std::endl;
@@ -283,377 +302,438 @@ void GridGenerator::allocArrays_BoundaryValues()
         }
     }//ende geo
 
-    if ((para->getNumprocs() > 1) /*&& (procNeighborsSendX.size() == procNeighborsRecvX.size())*/)
-	{
-		for (int direction = 0; direction < 6; direction++)
-		{
-            if( builder->getCommunicationProcess(direction) == INVALID_INDEX ) continue;
+    initalValuesDomainDecompostion();
+}
 
-			for (uint level = 0; level < builder->getNumberOfGridLevels(); level++)
-            {
-                if( direction == CommunicationDirections::MX || direction == CommunicationDirections::PX )
-                {
+void GridGenerator::initalValuesDomainDecompostion()
+{
+    if (para->getNumprocs() < 2)
+        return;
+    if ((para->getNumprocs() > 1) /*&& (procNeighborsSendX.size() == procNeighborsRecvX.size())*/) {
+        for (int direction = 0; direction < 6; direction++) {
+            if (builder->getCommunicationProcess(direction) == INVALID_INDEX)
+                continue;
+
+            for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+                if (direction == CommunicationDirections::MX || direction == CommunicationDirections::PX) {
                     int j = (int)para->getParH(level)->sendProcessNeighborX.size();
 
-		            para->getParH(level)->sendProcessNeighborX.emplace_back();
-		            para->getParD(level)->sendProcessNeighborX.emplace_back();
-		            para->getParH(level)->recvProcessNeighborX.emplace_back();
-		            para->getParD(level)->recvProcessNeighborX.emplace_back();
-		            if (para->getDiffOn()==true){
-			            para->getParH(level)->sendProcessNeighborADX.emplace_back();
-			            para->getParD(level)->sendProcessNeighborADX.emplace_back();
-			            para->getParH(level)->recvProcessNeighborADX.emplace_back();
-			            para->getParD(level)->recvProcessNeighborADX.emplace_back();
-		            }
-
-				    int tempSend = builder->getNumberOfSendIndices( direction, level );
-				    int tempRecv = builder->getNumberOfReceiveIndices( direction, level );
-				    if (tempSend > 0)
-				    {
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //send
-					    std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborX.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborX.back().numberOfNodes = tempSend;
-					    para->getParD(level)->sendProcessNeighborX.back().numberOfNodes = tempSend;
-					    para->getParH(level)->sendProcessNeighborX.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParD(level)->sendProcessNeighborX.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParH(level)->sendProcessNeighborX.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParD(level)->sendProcessNeighborX.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParH(level)->sendProcessNeighborX.back().memsizeFs = sizeof(real)     *tempSend;
-					    para->getParD(level)->sendProcessNeighborX.back().memsizeFs = sizeof(real)     *tempSend;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //recv
-					    std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborX.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborX.back().numberOfNodes = tempRecv;
-					    para->getParD(level)->recvProcessNeighborX.back().numberOfNodes = tempRecv;
-					    para->getParH(level)->recvProcessNeighborX.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParD(level)->recvProcessNeighborX.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParH(level)->recvProcessNeighborX.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParD(level)->recvProcessNeighborX.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParH(level)->recvProcessNeighborX.back().memsizeFs = sizeof(real)     *tempRecv;
-					    para->getParD(level)->recvProcessNeighborX.back().memsizeFs = sizeof(real)     *tempRecv;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //malloc on host and device
+                    para->getParH(level)->sendProcessNeighborX.emplace_back();
+                    para->getParD(level)->sendProcessNeighborX.emplace_back();
+                    para->getParH(level)->recvProcessNeighborX.emplace_back();
+                    para->getParD(level)->recvProcessNeighborX.emplace_back();
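+                    // additional buffers for the advection-diffusion (AD) distributions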
+                    if (para->getDiffOn() == true) {
+                        para->getParH(level)->sendProcessNeighborADX.emplace_back();
+                        para->getParD(level)->sendProcessNeighborADX.emplace_back();
+                        para->getParH(level)->recvProcessNeighborADX.emplace_back();
+                        para->getParD(level)->recvProcessNeighborADX.emplace_back();
+                    }
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborX.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborX.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborX.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborX.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParD(level)->sendProcessNeighborX.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParH(level)->sendProcessNeighborX.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborX.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborX.back().memsizeFs = sizeof(real) * tempSend;
+                        para->getParD(level)->sendProcessNeighborX.back().memsizeFs = sizeof(real) * tempSend;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborX.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborX.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborX.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborX.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParD(level)->recvProcessNeighborX.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParH(level)->recvProcessNeighborX.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborX.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborX.back().memsizeFs = sizeof(real) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborX.back().memsizeFs = sizeof(real) * tempRecv;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
                         cudaMemoryManager->cudaAllocProcessNeighborX(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //init index arrays
-                        builder->getSendIndices   (para->getParH(level)->sendProcessNeighborX[j].index, direction, level);
-                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborX[j].index, direction, level);
-					    ////////////////////////////////////////////////////////////////////////////////////////
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborX[j].index, direction, level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborX[j].index, direction,
+                                                   level);
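+                        // index arrays for reduced communication after fine-to-coarse interpolation (not needed on the last grid level)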
+                        if (level != builder->getNumberOfGridLevels() - 1 && para->useReducedCommunicationAfterFtoC)
+                            indexRearrangement->initCommunicationArraysForCommAfterFinetoCoarseX(level, j, direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
                         cudaMemoryManager->cudaCopyProcessNeighborXIndex(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-				    }
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
                 }
-                
-                if( direction == CommunicationDirections::MY || direction == CommunicationDirections::PY )
-                {
+
+                if (direction == CommunicationDirections::MY || direction == CommunicationDirections::PY) {
                     int j = (int)para->getParH(level)->sendProcessNeighborY.size();
 
-		            para->getParH(level)->sendProcessNeighborY.emplace_back();
-		            para->getParD(level)->sendProcessNeighborY.emplace_back();
-		            para->getParH(level)->recvProcessNeighborY.emplace_back();
-		            para->getParD(level)->recvProcessNeighborY.emplace_back();
-		            if (para->getDiffOn()==true){
-			            para->getParH(level)->sendProcessNeighborADY.emplace_back();
-			            para->getParD(level)->sendProcessNeighborADY.emplace_back();
-			            para->getParH(level)->recvProcessNeighborADY.emplace_back();
-			            para->getParD(level)->recvProcessNeighborADY.emplace_back();
-		            }
-
-				    int tempSend = builder->getNumberOfSendIndices( direction, level );
-				    int tempRecv = builder->getNumberOfReceiveIndices( direction, level );
-				    if (tempSend > 0)
-				    {
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //send
-					    std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborY.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborY.back().numberOfNodes = tempSend;
-					    para->getParD(level)->sendProcessNeighborY.back().numberOfNodes = tempSend;
-					    para->getParH(level)->sendProcessNeighborY.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParD(level)->sendProcessNeighborY.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParH(level)->sendProcessNeighborY.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParD(level)->sendProcessNeighborY.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParH(level)->sendProcessNeighborY.back().memsizeFs = sizeof(real)     *tempSend;
-					    para->getParD(level)->sendProcessNeighborY.back().memsizeFs = sizeof(real)     *tempSend;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //recv
-					    std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborY.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborY.back().numberOfNodes = tempRecv;
-					    para->getParD(level)->recvProcessNeighborY.back().numberOfNodes = tempRecv;
-					    para->getParH(level)->recvProcessNeighborY.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParD(level)->recvProcessNeighborY.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParH(level)->recvProcessNeighborY.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParD(level)->recvProcessNeighborY.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParH(level)->recvProcessNeighborY.back().memsizeFs = sizeof(real)     *tempRecv;
-					    para->getParD(level)->recvProcessNeighborY.back().memsizeFs = sizeof(real)     *tempRecv;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //malloc on host and device
+                    para->getParH(level)->sendProcessNeighborY.emplace_back();
+                    para->getParD(level)->sendProcessNeighborY.emplace_back();
+                    para->getParH(level)->recvProcessNeighborY.emplace_back();
+                    para->getParD(level)->recvProcessNeighborY.emplace_back();
+                    if (para->getDiffOn() == true) {
+                        para->getParH(level)->sendProcessNeighborADY.emplace_back();
+                        para->getParD(level)->sendProcessNeighborADY.emplace_back();
+                        para->getParH(level)->recvProcessNeighborADY.emplace_back();
+                        para->getParD(level)->recvProcessNeighborADY.emplace_back();
+                    }
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for Y send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborY.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborY.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborY.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborY.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParD(level)->sendProcessNeighborY.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParH(level)->sendProcessNeighborY.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborY.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborY.back().memsizeFs = sizeof(real) * tempSend;
+                        para->getParD(level)->sendProcessNeighborY.back().memsizeFs = sizeof(real) * tempSend;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborY.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborY.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborY.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborY.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParD(level)->recvProcessNeighborY.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParH(level)->recvProcessNeighborY.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborY.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborY.back().memsizeFs = sizeof(real) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborY.back().memsizeFs = sizeof(real) * tempRecv;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
                         cudaMemoryManager->cudaAllocProcessNeighborY(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //init index arrays
-                        builder->getSendIndices   (para->getParH(level)->sendProcessNeighborY[j].index, direction, level);
-                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborY[j].index, direction, level);
-					    ////////////////////////////////////////////////////////////////////////////////////////
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborY[j].index, direction, level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborY[j].index, direction,
+                                                   level);
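+                        // as for X: index arrays for reduced communication after fine-to-coarse interpolation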
+                        if (level != builder->getNumberOfGridLevels() - 1 && para->useReducedCommunicationAfterFtoC)
+                            indexRearrangement->initCommunicationArraysForCommAfterFinetoCoarseY(level, j, direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
                         cudaMemoryManager->cudaCopyProcessNeighborYIndex(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-				    }
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
                 }
-                
-                if( direction == CommunicationDirections::MZ || direction == CommunicationDirections::PZ )
-                {
+
+                if (direction == CommunicationDirections::MZ || direction == CommunicationDirections::PZ) {
                     int j = (int)para->getParH(level)->sendProcessNeighborZ.size();
 
-		            para->getParH(level)->sendProcessNeighborZ.emplace_back();
-		            para->getParD(level)->sendProcessNeighborZ.emplace_back();
-		            para->getParH(level)->recvProcessNeighborZ.emplace_back();
-		            para->getParD(level)->recvProcessNeighborZ.emplace_back();
-		            if (para->getDiffOn()==true){
-			            para->getParH(level)->sendProcessNeighborADZ.emplace_back();
-			            para->getParD(level)->sendProcessNeighborADZ.emplace_back();
-			            para->getParH(level)->recvProcessNeighborADZ.emplace_back();
-			            para->getParD(level)->recvProcessNeighborADZ.emplace_back();
-		            }
-
-				    int tempSend = builder->getNumberOfSendIndices( direction, level );
-				    int tempRecv = builder->getNumberOfReceiveIndices( direction, level );
-				    if (tempSend > 0)
-				    {
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //send
-					    std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborZ.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->sendProcessNeighborZ.back().numberOfNodes = tempSend;
-					    para->getParD(level)->sendProcessNeighborZ.back().numberOfNodes = tempSend;
-					    para->getParH(level)->sendProcessNeighborZ.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParD(level)->sendProcessNeighborZ.back().numberOfFs = para->getD3Qxx() * tempSend;
-					    para->getParH(level)->sendProcessNeighborZ.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParD(level)->sendProcessNeighborZ.back().memsizeIndex = sizeof(unsigned int)*tempSend;
-					    para->getParH(level)->sendProcessNeighborZ.back().memsizeFs = sizeof(real)     *tempSend;
-					    para->getParD(level)->sendProcessNeighborZ.back().memsizeFs = sizeof(real)     *tempSend;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //recv
-					    std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborZ.back().rankNeighbor = builder->getCommunicationProcess(direction);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    para->getParH(level)->recvProcessNeighborZ.back().numberOfNodes = tempRecv;
-					    para->getParD(level)->recvProcessNeighborZ.back().numberOfNodes = tempRecv;
-					    para->getParH(level)->recvProcessNeighborZ.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParD(level)->recvProcessNeighborZ.back().numberOfFs = para->getD3Qxx() * tempRecv;
-					    para->getParH(level)->recvProcessNeighborZ.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParD(level)->recvProcessNeighborZ.back().memsizeIndex = sizeof(unsigned int)*tempRecv;
-					    para->getParH(level)->recvProcessNeighborZ.back().memsizeFs = sizeof(real)     *tempRecv;
-					    para->getParD(level)->recvProcessNeighborZ.back().memsizeFs = sizeof(real)     *tempRecv;
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //malloc on host and device
+                    para->getParH(level)->sendProcessNeighborZ.emplace_back();
+                    para->getParD(level)->sendProcessNeighborZ.emplace_back();
+                    para->getParH(level)->recvProcessNeighborZ.emplace_back();
+                    para->getParD(level)->recvProcessNeighborZ.emplace_back();
+                    if (para->getDiffOn() == true) {
+                        para->getParH(level)->sendProcessNeighborADZ.emplace_back();
+                        para->getParD(level)->sendProcessNeighborADZ.emplace_back();
+                        para->getParH(level)->recvProcessNeighborADZ.emplace_back();
+                        para->getParD(level)->recvProcessNeighborADZ.emplace_back();
+                    }
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for Z send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborZ.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborZ.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborZ.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborZ.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParD(level)->sendProcessNeighborZ.back().numberOfFs    = para->getD3Qxx() * tempSend;
+                        para->getParH(level)->sendProcessNeighborZ.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborZ.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborZ.back().memsizeFs = sizeof(real) * tempSend;
+                        para->getParD(level)->sendProcessNeighborZ.back().memsizeFs = sizeof(real) * tempSend;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborZ.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborZ.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborZ.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborZ.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParD(level)->recvProcessNeighborZ.back().numberOfFs    = para->getD3Qxx() * tempRecv;
+                        para->getParH(level)->recvProcessNeighborZ.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborZ.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborZ.back().memsizeFs = sizeof(real) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborZ.back().memsizeFs = sizeof(real) * tempRecv;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
                         cudaMemoryManager->cudaAllocProcessNeighborZ(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-					    //init index arrays
-                        builder->getSendIndices   (para->getParH(level)->sendProcessNeighborZ[j].index, direction, level);
-                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborZ[j].index, direction, level);
-					    ////////////////////////////////////////////////////////////////////////////////////////
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborZ[j].index, direction, level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborZ[j].index, direction,
+                                                   level);
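+                        // as for X: index arrays for reduced communication after fine-to-coarse interpolation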
+                        if (level != builder->getNumberOfGridLevels() - 1 && para->useReducedCommunicationAfterFtoC)
+                            indexRearrangement->initCommunicationArraysForCommAfterFinetoCoarseZ(level, j, direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
                         cudaMemoryManager->cudaCopyProcessNeighborZIndex(level, j);
-					    ////////////////////////////////////////////////////////////////////////////////////////
-				    }
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
                 }
+            }
+        }
+    }
 
-			}
-		}
-	}
-
+    // data exchange for F3 / G6 (six G values per node)
+    if ((para->getNumprocs() > 1) && (para->getIsF3())) {
+        for (int direction = 0; direction < 6; direction++) {
+            if (builder->getCommunicationProcess(direction) == INVALID_INDEX)
+                continue;
 
-	// data exchange for F3 / G6
-	if ((para->getNumprocs() > 1) && (para->getIsF3()) )
-	{
-		for (int direction = 0; direction < 6; direction++)
-		{
-			if (builder->getCommunicationProcess(direction) == INVALID_INDEX) continue;
-
-			for (uint level = 0; level < builder->getNumberOfGridLevels(); level++)
-			{
-				if (direction == CommunicationDirections::MX || direction == CommunicationDirections::PX)
-				{
+            for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+                if (direction == CommunicationDirections::MX || direction == CommunicationDirections::PX) {
                     int j = (int)para->getParH(level)->sendProcessNeighborF3X.size();
 
-					para->getParH(level)->sendProcessNeighborF3X.emplace_back();
-					para->getParD(level)->sendProcessNeighborF3X.emplace_back();
-					para->getParH(level)->recvProcessNeighborF3X.emplace_back();
-					para->getParD(level)->recvProcessNeighborF3X.emplace_back();
-
-					int tempSend = builder->getNumberOfSendIndices(direction, level);
-					int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
-					if (tempSend > 0)
-					{
-						////////////////////////////////////////////////////////////////////////////////////////
-						//send
-						std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3X.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3X.back().numberOfNodes = tempSend;
-						para->getParD(level)->sendProcessNeighborF3X.back().numberOfNodes = tempSend;
-						para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs = 6 * tempSend;
-						para->getParD(level)->sendProcessNeighborF3X.back().numberOfGs = 6 * tempSend;
-						para->getParH(level)->sendProcessNeighborF3X.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParD(level)->sendProcessNeighborF3X.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParH(level)->sendProcessNeighborF3X.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs;
-						para->getParD(level)->sendProcessNeighborF3X.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//recv
-						std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3X.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3X.back().numberOfNodes = tempRecv;
-						para->getParD(level)->recvProcessNeighborF3X.back().numberOfNodes = tempRecv;
-						para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs = 6 * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3X.back().numberOfGs = 6 * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3X.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3X.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3X.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs;
-						para->getParD(level)->recvProcessNeighborF3X.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//malloc on host and device
-						cudaMemoryManager->cudaAllocProcessNeighborF3X(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-						//init index arrays
-						builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3X[j].index, direction, level);
-						builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3X[j].index, direction, level);
-						////////////////////////////////////////////////////////////////////////////////////////
-						cudaMemoryManager->cudaCopyProcessNeighborF3XIndex(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-					}
-				}
-
-				if (direction == CommunicationDirections::MY || direction == CommunicationDirections::PY)
-				{
+                    para->getParH(level)->sendProcessNeighborF3X.emplace_back();
+                    para->getParD(level)->sendProcessNeighborF3X.emplace_back();
+                    para->getParH(level)->recvProcessNeighborF3X.emplace_back();
+                    para->getParD(level)->recvProcessNeighborF3X.emplace_back();
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3X.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3X.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborF3X.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs    = 6 * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3X.back().numberOfGs    = 6 * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3X.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3X.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3X.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs;
+                        para->getParD(level)->sendProcessNeighborF3X.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3X.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3X.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3X.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3X.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs    = 6 * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3X.back().numberOfGs    = 6 * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3X.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3X.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3X.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs;
+                        para->getParD(level)->recvProcessNeighborF3X.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3X.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
+                        cudaMemoryManager->cudaAllocProcessNeighborF3X(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3X[j].index, direction,
+                                                level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3X[j].index, direction,
+                                                   level);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        cudaMemoryManager->cudaCopyProcessNeighborF3XIndex(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
+                }
+
+                if (direction == CommunicationDirections::MY || direction == CommunicationDirections::PY) {
                     int j = (int)para->getParH(level)->sendProcessNeighborF3Y.size();
 
-					para->getParH(level)->sendProcessNeighborF3Y.emplace_back();
-					para->getParD(level)->sendProcessNeighborF3Y.emplace_back();
-					para->getParH(level)->recvProcessNeighborF3Y.emplace_back();
-					para->getParD(level)->recvProcessNeighborF3Y.emplace_back();
-
-					int tempSend = builder->getNumberOfSendIndices(direction, level);
-					int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
-					if (tempSend > 0)
-					{
-						////////////////////////////////////////////////////////////////////////////////////////
-						//send
-						std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3Y.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3Y.back().numberOfNodes = tempSend;
-						para->getParD(level)->sendProcessNeighborF3Y.back().numberOfNodes = tempSend;
-						para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs = 6 * tempSend;
-						para->getParD(level)->sendProcessNeighborF3Y.back().numberOfGs = 6 * tempSend;
-						para->getParH(level)->sendProcessNeighborF3Y.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParD(level)->sendProcessNeighborF3Y.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParH(level)->sendProcessNeighborF3Y.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs;
-						para->getParD(level)->sendProcessNeighborF3Y.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//recv
-						std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3Y.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3Y.back().numberOfNodes = tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Y.back().numberOfNodes = tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs = 6 * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Y.back().numberOfGs = 6 * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Y.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Y.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Y.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs;
-						para->getParD(level)->recvProcessNeighborF3Y.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//malloc on host and device
-						cudaMemoryManager->cudaAllocProcessNeighborF3Y(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-						//init index arrays
-						builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3Y[j].index, direction, level);
-						builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3Y[j].index, direction, level);
-						////////////////////////////////////////////////////////////////////////////////////////
-						cudaMemoryManager->cudaCopyProcessNeighborF3YIndex(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-					}
-				}
-
-				if (direction == CommunicationDirections::MZ || direction == CommunicationDirections::PZ)
-				{
-                    int j = (int)para->getParH(level)->sendProcessNeighborF3Z.size();
+                    para->getParH(level)->sendProcessNeighborF3Y.emplace_back();
+                    para->getParD(level)->sendProcessNeighborF3Y.emplace_back();
+                    para->getParH(level)->recvProcessNeighborF3Y.emplace_back();
+                    para->getParD(level)->recvProcessNeighborF3Y.emplace_back();
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for Y send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3Y.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3Y.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Y.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs    = 6 * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Y.back().numberOfGs    = 6 * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Y.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Y.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Y.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs;
+                        para->getParD(level)->sendProcessNeighborF3Y.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3Y.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3Y.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3Y.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Y.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs    = 6 * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Y.back().numberOfGs    = 6 * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Y.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Y.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Y.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs;
+                        para->getParD(level)->recvProcessNeighborF3Y.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3Y.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
+                        cudaMemoryManager->cudaAllocProcessNeighborF3Y(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3Y[j].index, direction,
+                                                level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3Y[j].index, direction,
+                                                   level);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        cudaMemoryManager->cudaCopyProcessNeighborF3YIndex(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
+                }
 
-					para->getParH(level)->sendProcessNeighborF3Z.emplace_back();
-					para->getParD(level)->sendProcessNeighborF3Z.emplace_back();
-					para->getParH(level)->recvProcessNeighborF3Z.emplace_back();
-					para->getParD(level)->recvProcessNeighborF3Z.emplace_back();
-
-					int tempSend = builder->getNumberOfSendIndices(direction, level);
-					int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
-					if (tempSend > 0)
-					{
-						////////////////////////////////////////////////////////////////////////////////////////
-						//send
-						std::cout << "size of Data for X send buffer, Level " << level << " : " << tempSend << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3Z.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->sendProcessNeighborF3Z.back().numberOfNodes = tempSend;
-						para->getParD(level)->sendProcessNeighborF3Z.back().numberOfNodes = tempSend;
-						para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs = 6 * tempSend;
-						para->getParD(level)->sendProcessNeighborF3Z.back().numberOfGs = 6 * tempSend;
-						para->getParH(level)->sendProcessNeighborF3Z.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParD(level)->sendProcessNeighborF3Z.back().memsizeIndex = sizeof(unsigned int) * tempSend;
-						para->getParH(level)->sendProcessNeighborF3Z.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs;
-						para->getParD(level)->sendProcessNeighborF3Z.back().memsizeGs = sizeof(real) * para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//recv
-						std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv << std::endl;
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3Z.back().rankNeighbor = builder->getCommunicationProcess(direction);
-						////////////////////////////////////////////////////////////////////////////////////////
-						para->getParH(level)->recvProcessNeighborF3Z.back().numberOfNodes = tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Z.back().numberOfNodes = tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs = 6 * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Z.back().numberOfGs = 6 * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Z.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParD(level)->recvProcessNeighborF3Z.back().memsizeIndex = sizeof(unsigned int) * tempRecv;
-						para->getParH(level)->recvProcessNeighborF3Z.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs;
-						para->getParD(level)->recvProcessNeighborF3Z.back().memsizeGs = sizeof(real) * para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs;
-						////////////////////////////////////////////////////////////////////////////////////////
-						//malloc on host and device
-						cudaMemoryManager->cudaAllocProcessNeighborF3Z(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-						//init index arrays
-						builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3Z[j].index, direction, level);
-						builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3Z[j].index, direction, level);
-						////////////////////////////////////////////////////////////////////////////////////////
-						cudaMemoryManager->cudaCopyProcessNeighborF3ZIndex(level, j);
-						////////////////////////////////////////////////////////////////////////////////////////
-					}
-				}
-
-			}
-		}
-	}
+                if (direction == CommunicationDirections::MZ || direction == CommunicationDirections::PZ) {
+                    int j = (int)para->getParH(level)->sendProcessNeighborF3Z.size();
 
+                    para->getParH(level)->sendProcessNeighborF3Z.emplace_back();
+                    para->getParD(level)->sendProcessNeighborF3Z.emplace_back();
+                    para->getParH(level)->recvProcessNeighborF3Z.emplace_back();
+                    para->getParD(level)->recvProcessNeighborF3Z.emplace_back();
+
+                    int tempSend = builder->getNumberOfSendIndices(direction, level);
+                    int tempRecv = builder->getNumberOfReceiveIndices(direction, level);
+                    if (tempSend > 0) {
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // send
+                        std::cout << "size of Data for Z send buffer, Level " << level << " : " << tempSend
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3Z.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->sendProcessNeighborF3Z.back().numberOfNodes = tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Z.back().numberOfNodes = tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs    = 6 * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Z.back().numberOfGs    = 6 * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Z.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParD(level)->sendProcessNeighborF3Z.back().memsizeIndex =
+                            sizeof(unsigned int) * tempSend;
+                        para->getParH(level)->sendProcessNeighborF3Z.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs;
+                        para->getParD(level)->sendProcessNeighborF3Z.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->sendProcessNeighborF3Z.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // recv
+                        std::cout << "size of Data for X receive buffer, Level " << level << " : " << tempRecv
+                                  << std::endl;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3Z.back().rankNeighbor =
+                            builder->getCommunicationProcess(direction);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        para->getParH(level)->recvProcessNeighborF3Z.back().numberOfNodes = tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Z.back().numberOfNodes = tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs    = 6 * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Z.back().numberOfGs    = 6 * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Z.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParD(level)->recvProcessNeighborF3Z.back().memsizeIndex =
+                            sizeof(unsigned int) * tempRecv;
+                        para->getParH(level)->recvProcessNeighborF3Z.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs;
+                        para->getParD(level)->recvProcessNeighborF3Z.back().memsizeGs =
+                            sizeof(real) * para->getParH(level)->recvProcessNeighborF3Z.back().numberOfGs;
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // malloc on host and device
+                        cudaMemoryManager->cudaAllocProcessNeighborF3Z(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        // init index arrays
+                        builder->getSendIndices(para->getParH(level)->sendProcessNeighborF3Z[j].index, direction,
+                                                level);
+                        builder->getReceiveIndices(para->getParH(level)->recvProcessNeighborF3Z[j].index, direction,
+                                                   level);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                        cudaMemoryManager->cudaCopyProcessNeighborF3ZIndex(level, j);
+                        ////////////////////////////////////////////////////////////////////////////////////////
+                    }
+                }
+            }
+        }
+    }
 }
 
-
 void GridGenerator::allocArrays_BoundaryQs()
 {
 	std::cout << "------read BoundaryQs-------" << std::endl;
@@ -1032,16 +1112,23 @@ void GridGenerator::allocArrays_OffsetScale()
         builder->getOffsetCF(para->getParH(level)->offCF.xOffCF, para->getParH(level)->offCF.yOffCF, para->getParH(level)->offCF.zOffCF, level);
         builder->getOffsetFC(para->getParH(level)->offFC.xOffFC, para->getParH(level)->offFC.yOffFC, para->getParH(level)->offFC.zOffFC, level);
         builder->getGridInterfaceIndices(para->getParH(level)->intCF.ICellCFC, para->getParH(level)->intCF.ICellCFF, para->getParH(level)->intFC.ICellFCC, para->getParH(level)->intFC.ICellFCF, level);
+        
+        if (para->getUseStreams() || para->getNumprocs() > 1) {
+            // split fine-to-coarse indices into border and bulk
+            indexRearrangement->splitFineToCoarseIntoBorderAndBulk(level);
+            // split coarse-to-fine indices into border and bulk
+            indexRearrangement->splitCoarseToFineIntoBorderAndBulk(level);
+        }
         ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
         //copy
 		cudaMemoryManager->cudaCopyInterfaceCF(level);
 		cudaMemoryManager->cudaCopyInterfaceFC(level);
 		cudaMemoryManager->cudaCopyInterfaceOffCF(level);
 		cudaMemoryManager->cudaCopyInterfaceOffFC(level);
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     }
 }
 
-
 void GridGenerator::setDimensions()
 {
 	//std::vector<int> localGridNX(1);
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
index 79f7f2b6a1d1fc45217cf5e28df4f5b599f5c0e4..aad9b2e05f462319440f3a4a93a67b54a123b426 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h
@@ -11,6 +11,7 @@
 
 class Parameter;
 class GridBuilder;
+class IndexRearrangementForStreams;
 
 class GridGenerator
 	: public GridProvider
@@ -20,15 +21,19 @@ private:
 	std::vector<std::string> channelBoundaryConditions;
 
 	std::shared_ptr<GridBuilder> builder;
+    std::unique_ptr<IndexRearrangementForStreams> indexRearrangement;
 
 public:
-    VIRTUALFLUIDS_GPU_EXPORT GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager);
+    VIRTUALFLUIDS_GPU_EXPORT GridGenerator(std::shared_ptr<GridBuilder> builder, std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager, vf::gpu::Communicator& communicator);
 	VIRTUALFLUIDS_GPU_EXPORT virtual ~GridGenerator();
 
 	void allocArrays_CoordNeighborGeo() override;
-	void allocArrays_BoundaryValues() override;
+    void allocArrays_BoundaryValues() override;
+
 	void allocArrays_BoundaryQs() override;
     void allocArrays_OffsetScale() override;
+    void allocArrays_fluidNodeIndices() override;
+    void allocArrays_fluidNodeIndicesBorder() override;
 
 	virtual void setDimensions() override;
 	virtual void setBoundingBox() override;
@@ -58,7 +63,8 @@ private:
 	void setSizeGeoQs(unsigned int level) const;
 	void setQ27Size(QforBoundaryConditions &Q, real* QQ, unsigned int sizeQ) const;
 	bool hasQs(int channelSide, unsigned int level) const;
-
+    
+    void initalValuesDomainDecompostion();
 public:
     void initalGridInformations() override;
 
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cab83cc4654f96d317bfbcf28b8f4a27620b5001
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.cpp
@@ -0,0 +1,619 @@
+#include "IndexRearrangementForStreams.h"
+
+#include "Communication/Communicator.h"
+#include "Parameter/Parameter.h"
+#include <GridGenerator/grid/Grid.h>
+#include <GridGenerator/grid/GridBuilder/GridBuilder.h>
+
+#include <algorithm>
+#include <iostream>
+
+IndexRearrangementForStreams::IndexRearrangementForStreams(std::shared_ptr<Parameter> para,
+                                                           std::shared_ptr<GridBuilder> builder, vf::gpu::Communicator& communicator)
+    : builder(builder), para(para), communicator(communicator)
+{
+}
+
+void IndexRearrangementForStreams::initCommunicationArraysForCommAfterFinetoCoarseX(const uint &level,
+                                                                                    int indexOfProcessNeighbor,
+                                                                                    int direction)
+{
+    // init send indices for communication after fine to coarse
+    std::cout << "communication: reorder send indices X ";
+    para->initProcessNeighborsAfterFtoCX(level);
+    std::vector<uint> sendIndicesForCommAfterFtoCPositions;
+    reorderSendIndicesForCommAfterFtoCX(direction, level, indexOfProcessNeighbor, sendIndicesForCommAfterFtoCPositions);
+    para->setSendProcessNeighborsAfterFtoCX(
+        para->getParH(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].numberOfNodes, level,
+        indexOfProcessNeighbor);
+
+    // send sendIndicesForCommAfterFtoCPositions to receiving process and receive recvIndicesForCommAfterFtoCPositions
+    // from sending process
+    std::cout << "mpi send and receive ";
+    std::vector<uint> recvIndicesForCommAfterFtoCPositions;
+    recvIndicesForCommAfterFtoCPositions.resize(
+        (size_t)para->getParH(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].numberOfNodes *
+        2); // give the vector an arbitrary size (larger than needed) // TODO: Find a better way
+    communicator.exchangeIndices(recvIndicesForCommAfterFtoCPositions.data(), (int)recvIndicesForCommAfterFtoCPositions.size(),
+                          para->getParH(level)->recvProcessNeighborX[indexOfProcessNeighbor].rankNeighbor,
+                          sendIndicesForCommAfterFtoCPositions.data(), (int)sendIndicesForCommAfterFtoCPositions.size(),
+                          para->getParH(level)->sendProcessNeighborX[indexOfProcessNeighbor].rankNeighbor);
+
+    // resize receiving vector to correct size
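+    // the trick below relies on two properties (see also the TODO): resize() value-initialized the
+    // oversized buffer, so its unused tail is all zeros, and the received positions are pairwise
+    // distinct. std::unique therefore collapses only the trailing run of zeros into a single
+    // element, which the erase starting at std::prev(it, 1) removes as well. This breaks if a
+    // valid position 0 arrives as the last received element.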
+    auto it = std::unique(recvIndicesForCommAfterFtoCPositions.begin(), recvIndicesForCommAfterFtoCPositions.end());
+    recvIndicesForCommAfterFtoCPositions.erase(std::prev(it, 1),
+                                               recvIndicesForCommAfterFtoCPositions.end()); // TODO: Find a better way
+
+    // init receive indices for communication after fine to coarse
+    std::cout << "reorder receive indices ";
+    reorderRecvIndicesForCommAfterFtoCX(direction, level, indexOfProcessNeighbor, recvIndicesForCommAfterFtoCPositions);
+    para->setRecvProcessNeighborsAfterFtoCX(
+        para->getParH(level)->recvProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].numberOfNodes, level,
+        indexOfProcessNeighbor);
+    copyProcessNeighborToCommAfterFtoCX(level, indexOfProcessNeighbor);
+
+    std::cout << "done." << std::endl;
+}
+
+void IndexRearrangementForStreams::initCommunicationArraysForCommAfterFinetoCoarseY(const uint &level,
+                                                                                    int indexOfProcessNeighbor,
+                                                                                    int direction)
+{
+    // init send indices for communication after fine to coarse
+    std::cout << "communication: reorder send indices Y ";
+    para->initProcessNeighborsAfterFtoCY(level);
+    std::vector<uint> sendIndicesForCommAfterFtoCPositions;
+    reorderSendIndicesForCommAfterFtoCY(direction, level, indexOfProcessNeighbor, sendIndicesForCommAfterFtoCPositions);
+    para->setSendProcessNeighborsAfterFtoCY(
+        para->getParH(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].numberOfNodes, level,
+        indexOfProcessNeighbor);
+
+    // send sendIndicesForCommAfterFtoCPositions to receiving process and receive recvIndicesForCommAfterFtoCPositions
+    // from sending process
+    std::cout << "mpi send and receive ";
+    std::vector<uint> recvIndicesForCommAfterFtoCPositions;
+    recvIndicesForCommAfterFtoCPositions.resize(
+        (size_t)para->getParH(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].numberOfNodes *
+        2); // give the vector an arbitrary size (larger than needed) // TODO: Find a better way
+    communicator.exchangeIndices(recvIndicesForCommAfterFtoCPositions.data(), (int)recvIndicesForCommAfterFtoCPositions.size(),
+                          para->getParH(level)->recvProcessNeighborY[indexOfProcessNeighbor].rankNeighbor,
+                          sendIndicesForCommAfterFtoCPositions.data(), (int)sendIndicesForCommAfterFtoCPositions.size(),
+                          para->getParH(level)->sendProcessNeighborY[indexOfProcessNeighbor].rankNeighbor);
+
+    // resize receiving vector to correct size
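+    // (see the note on this resize trick in the X variant above)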
+    auto it = std::unique(recvIndicesForCommAfterFtoCPositions.begin(), recvIndicesForCommAfterFtoCPositions.end());
+    recvIndicesForCommAfterFtoCPositions.erase(std::prev(it, 1),
+                                               recvIndicesForCommAfterFtoCPositions.end()); // TODO: Find a better way
+
+    // init receive indices for communication after fine to coarse
+    std::cout << "reorder receive indices ";
+    reorderRecvIndicesForCommAfterFtoCY(direction, level, indexOfProcessNeighbor, recvIndicesForCommAfterFtoCPositions);
+    para->setRecvProcessNeighborsAfterFtoCY(
+        para->getParH(level)->recvProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].numberOfNodes, level,
+        indexOfProcessNeighbor);
+
+    copyProcessNeighborToCommAfterFtoCY(level, indexOfProcessNeighbor);
+
+    std::cout << "done." << std::endl;
+}
+
+void IndexRearrangementForStreams::initCommunicationArraysForCommAfterFinetoCoarseZ(const uint &level,
+                                                                                    int indexOfProcessNeighbor,
+                                                                                    int direction)
+{
+    // init send indices for communication after fine to coarse
+    std::cout << "communication: reorder send indices Z ";
+    para->initProcessNeighborsAfterFtoCZ(level);
+    std::vector<uint> sendIndicesForCommAfterFtoCPositions;
+    reorderSendIndicesForCommAfterFtoCZ(direction, level, indexOfProcessNeighbor, sendIndicesForCommAfterFtoCPositions);
+    para->setSendProcessNeighborsAfterFtoCZ(
+        para->getParH(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].numberOfNodes, level,
+        indexOfProcessNeighbor);
+
+    // send sendIndicesForCommAfterFtoCPositions to receiving process and receive recvIndicesForCommAfterFtoCPositions
+    // from sending process
+    std::cout << "mpi send and receive ";
+    std::vector<uint> recvIndicesForCommAfterFtoCPositions;
+    recvIndicesForCommAfterFtoCPositions.resize(
+        (size_t)para->getParH(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].numberOfNodes *
+        2); // give the vector an arbitrary size (larger than needed) // TODO: Find a better way
+    communicator.exchangeIndices(recvIndicesForCommAfterFtoCPositions.data(), (int)recvIndicesForCommAfterFtoCPositions.size(),
+                          para->getParH(level)->recvProcessNeighborZ[indexOfProcessNeighbor].rankNeighbor,
+                          sendIndicesForCommAfterFtoCPositions.data(), (int)sendIndicesForCommAfterFtoCPositions.size(),
+                          para->getParH(level)->sendProcessNeighborZ[indexOfProcessNeighbor].rankNeighbor);
+
+    // resize receiving vector to correct size
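+    // (see the note on this resize trick in the X variant above)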
+    auto it = std::unique(recvIndicesForCommAfterFtoCPositions.begin(), recvIndicesForCommAfterFtoCPositions.end());
+    recvIndicesForCommAfterFtoCPositions.erase(std::prev(it, 1),
+                                               recvIndicesForCommAfterFtoCPositions.end()); // TODO: Find a better way
+
+    // init receive indices for communication after fine to coarse
+    std::cout << "reorder receive indices ";
+    reorderRecvIndicesForCommAfterFtoCZ(direction, level, indexOfProcessNeighbor, recvIndicesForCommAfterFtoCPositions);
+    para->setRecvProcessNeighborsAfterFtoCZ(
+        para->getParH(level)->recvProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].numberOfNodes, level,
+        indexOfProcessNeighbor);
+
+    copyProcessNeighborToCommAfterFtoCZ(level, indexOfProcessNeighbor);
+
+    std::cout << "done." << std::endl;
+}
+
+void IndexRearrangementForStreams::copyProcessNeighborToCommAfterFtoCX(const uint &level, int indexOfProcessNeighbor)
+{
+    // init f[0]*
+    para->getParD(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].f[0] =
+        para->getParD(level)->sendProcessNeighborX[indexOfProcessNeighbor].f[0];
+    para->getParH(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].f[0] =
+        para->getParH(level)->sendProcessNeighborX[indexOfProcessNeighbor].f[0];
+    para->getParD(level)->recvProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].f[0] =
+        para->getParD(level)->recvProcessNeighborX[indexOfProcessNeighbor].f[0];
+    para->getParH(level)->recvProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].f[0] =
+        para->getParH(level)->recvProcessNeighborX[indexOfProcessNeighbor].f[0];
+
+    // init index*
+    para->getParD(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].index =
+        para->getParD(level)->sendProcessNeighborX[indexOfProcessNeighbor].index;
+    para->getParH(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].index =
+        para->getParH(level)->sendProcessNeighborX[indexOfProcessNeighbor].index;
+    para->getParD(level)->recvProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].index =
+        para->getParD(level)->recvProcessNeighborX[indexOfProcessNeighbor].index;
+    para->getParH(level)->recvProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].index =
+        para->getParH(level)->recvProcessNeighborX[indexOfProcessNeighbor].index;
+
+    // rank neighbor
+    para->getParH(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].rankNeighbor =
+        para->getParH(level)->sendProcessNeighborX[indexOfProcessNeighbor].rankNeighbor;
+    para->getParH(level)->recvProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].rankNeighbor =
+        para->getParH(level)->recvProcessNeighborX[indexOfProcessNeighbor].rankNeighbor;
+}
+
+void IndexRearrangementForStreams::copyProcessNeighborToCommAfterFtoCY(const uint &level, int indexOfProcessNeighbor)
+{
+    // init f[0]*
+    para->getParD(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].f[0] =
+        para->getParD(level)->sendProcessNeighborY[indexOfProcessNeighbor].f[0];
+    para->getParH(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].f[0] =
+        para->getParH(level)->sendProcessNeighborY[indexOfProcessNeighbor].f[0];
+    para->getParD(level)->recvProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].f[0] =
+        para->getParD(level)->recvProcessNeighborY[indexOfProcessNeighbor].f[0];
+    para->getParH(level)->recvProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].f[0] =
+        para->getParH(level)->recvProcessNeighborY[indexOfProcessNeighbor].f[0];
+
+    // init index*
+    para->getParD(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].index =
+        para->getParD(level)->sendProcessNeighborY[indexOfProcessNeighbor].index;
+    para->getParH(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].index =
+        para->getParH(level)->sendProcessNeighborY[indexOfProcessNeighbor].index;
+    para->getParD(level)->recvProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].index =
+        para->getParD(level)->recvProcessNeighborY[indexOfProcessNeighbor].index;
+    para->getParH(level)->recvProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].index =
+        para->getParH(level)->recvProcessNeighborY[indexOfProcessNeighbor].index;
+
+    // rank neighbor
+    para->getParH(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].rankNeighbor =
+        para->getParH(level)->sendProcessNeighborY[indexOfProcessNeighbor].rankNeighbor;
+    para->getParH(level)->recvProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].rankNeighbor =
+        para->getParH(level)->recvProcessNeighborY[indexOfProcessNeighbor].rankNeighbor;
+}
+
+void IndexRearrangementForStreams::copyProcessNeighborToCommAfterFtoCZ(const uint &level, int indexOfProcessNeighbor)
+{
+    // init f[0]*
+    para->getParD(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].f[0] =
+        para->getParD(level)->sendProcessNeighborZ[indexOfProcessNeighbor].f[0];
+    para->getParH(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].f[0] =
+        para->getParH(level)->sendProcessNeighborZ[indexOfProcessNeighbor].f[0];
+    para->getParD(level)->recvProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].f[0] =
+        para->getParD(level)->recvProcessNeighborZ[indexOfProcessNeighbor].f[0];
+    para->getParH(level)->recvProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].f[0] =
+        para->getParH(level)->recvProcessNeighborZ[indexOfProcessNeighbor].f[0];
+
+    // init index*
+    para->getParD(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].index =
+        para->getParD(level)->sendProcessNeighborZ[indexOfProcessNeighbor].index;
+    para->getParH(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].index =
+        para->getParH(level)->sendProcessNeighborZ[indexOfProcessNeighbor].index;
+    para->getParD(level)->recvProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].index =
+        para->getParD(level)->recvProcessNeighborZ[indexOfProcessNeighbor].index;
+    para->getParH(level)->recvProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].index =
+        para->getParH(level)->recvProcessNeighborZ[indexOfProcessNeighbor].index;
+
+    // rank neighbor
+    para->getParH(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].rankNeighbor =
+        para->getParH(level)->sendProcessNeighborZ[indexOfProcessNeighbor].rankNeighbor;
+    para->getParH(level)->recvProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].rankNeighbor =
+        para->getParH(level)->recvProcessNeighborZ[indexOfProcessNeighbor].rankNeighbor;
+}
+
+void IndexRearrangementForStreams::reorderSendIndicesForCommAfterFtoCX(
+    int direction, int level, int indexOfProcessNeighbor, std::vector<uint> &sendIndicesForCommAfterFtoCPositions)
+{
+    int *sendIndices = para->getParH(level)->sendProcessNeighborX[indexOfProcessNeighbor].index;
+    int &numberOfSendNodesAfterFtoC =
+        para->getParH(level)->sendProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].numberOfNodes;
+    reorderSendIndicesForCommAfterFtoC(sendIndices, numberOfSendNodesAfterFtoC, direction, level,
+                                       sendIndicesForCommAfterFtoCPositions);
+}
+
+void IndexRearrangementForStreams::reorderSendIndicesForCommAfterFtoCY(
+    int direction, int level, int indexOfProcessNeighbor, std::vector<uint> &sendIndicesForCommAfterFtoCPositions)
+{
+    int *sendIndices = para->getParH(level)->sendProcessNeighborY[indexOfProcessNeighbor].index;
+    int &numberOfSendNodesAfterFtoC =
+        para->getParH(level)->sendProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].numberOfNodes;
+    reorderSendIndicesForCommAfterFtoC(sendIndices, numberOfSendNodesAfterFtoC, direction, level,
+                                       sendIndicesForCommAfterFtoCPositions);
+}
+
+void IndexRearrangementForStreams::reorderSendIndicesForCommAfterFtoCZ(
+    int direction, int level, int indexOfProcessNeighbor, std::vector<uint> &sendIndicesForCommAfterFtoCPositions)
+{
+    int *sendIndices = para->getParH(level)->sendProcessNeighborZ[indexOfProcessNeighbor].index;
+    int &numberOfSendNodesAfterFtoC =
+        para->getParH(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].numberOfNodes;
+    reorderSendIndicesForCommAfterFtoC(sendIndices, numberOfSendNodesAfterFtoC, direction, level,
+                                       sendIndicesForCommAfterFtoCPositions);
+}
+
+void IndexRearrangementForStreams::reorderSendIndicesForCommAfterFtoC(
+    int *sendIndices, int &numberOfSendNodesAfterFtoC, int direction, int level,
+    std::vector<uint> &sendIndicesForCommAfterFtoCPositions)
+{
+    *logging::out << logging::Logger::INFO_INTERMEDIATE
+                  << "reorder send indices for communication after fine to coarse: level: " << level
+                  << " direction: " << direction;
+    if (para->getParH(level)->intCF.kCF == 0 || para->getParH(level)->intFC.kFC == 0)
+        *logging::out << logging::Logger::LOGGER_ERROR
+                      << "reorderSendIndicesForCommAfterFtoC(): para->getParH(level)->intCF and intFC need to be "
+                         "initialized before calling this function"
+                      << "\n";
+
+    int sparseIndexSend;
+    std::vector<int> sendIndicesAfterFtoC;
+    std::vector<int> sendIndicesOther;
+    uint numberOfSendIndices = builder->getNumberOfSendIndices(direction, level);
+
+    // pass 1: find the send indices which are coarse cells of the fine-to-coarse interpolation (iCellFCC)
+    for (uint posInSendIndices = 0; posInSendIndices < numberOfSendIndices; posInSendIndices++) {
+        sparseIndexSend = sendIndices[posInSendIndices];
+        if (isSparseIndexInICellFCC(para->getParH(level)->intFC.kFC, sparseIndexSend, level)) {
+            addUniqueIndexToCommunicationVectors(sendIndicesAfterFtoC, sparseIndexSend,
+                                                 sendIndicesForCommAfterFtoCPositions, posInSendIndices);
+        }
+    }
+
+    // pass 2: find the send indices which belong to cells of the coarse-to-fine interpolation (iCellCFC)
+    std::vector<uint> nodesCFC;
+    aggregateNodesInICellCFC(level, nodesCFC);
+    for (auto sparseIndex : nodesCFC)
+        findIfSparseIndexIsInSendIndicesAndAddToCommVectors(sparseIndex, sendIndices, numberOfSendIndices,
+                                                            sendIndicesAfterFtoC, sendIndicesForCommAfterFtoCPositions);
+
+    numberOfSendNodesAfterFtoC = (int)sendIndicesAfterFtoC.size();
+
+    findIndicesNotInCommAfterFtoC(numberOfSendIndices, sendIndices, sendIndicesAfterFtoC, sendIndicesOther);
+
+    // copy new vectors back to sendIndices array
+    for (int i = 0; i < numberOfSendNodesAfterFtoC; i++)
+        sendIndices[i] = sendIndicesAfterFtoC[i];
+    for (uint i = 0; i < (uint)sendIndicesOther.size(); i++)
+        sendIndices[i + numberOfSendNodesAfterFtoC] = sendIndicesOther[i];
+
+    *logging::out << logging::Logger::INFO_INTERMEDIATE << "... Process " << communicator.getPID()
+                  << " numberOfSendNodesAfterFtoC: " << numberOfSendNodesAfterFtoC << "\n";
+
+    if (numberOfSendNodesAfterFtoC + sendIndicesOther.size() != numberOfSendIndices) {
+        *logging::out << logging::Logger::LOGGER_ERROR
+                      << "reorderSendIndicesForCommAfterFtoC(): incorrect number of nodes"
+                      << "\n";
+        std::cout << "numberOfSendNodesAfterFtoC = " << numberOfSendNodesAfterFtoC
+                  << ", sendOrIndicesOther.size() = " << sendIndicesOther.size()
+                  << ", numberOfSendOrRecvIndices = " << numberOfSendIndices << std::endl;
+    }
+}
+
+bool IndexRearrangementForStreams::isSparseIndexInICellFCC(uint sizeOfICellFCC, int sparseIndex, int level)
+{
+    if (sparseIndex < 0)
+        return false;
+    for (uint j = 0; j < sizeOfICellFCC; j++) {
+        if (para->getParH(level)->intFC.ICellFCC[j] == (uint)sparseIndex) {
+            return true;
+        }
+    }
+    return false;
+}
+
+void IndexRearrangementForStreams::aggregateNodesInICellCFC(int level, std::vector<uint> &nodesCFC)
+{
+    uint sparseIndex;
+    uint *neighborX = para->getParH(level)->neighborX_SP;
+    uint *neighborY = para->getParH(level)->neighborY_SP;
+    uint *neighborZ = para->getParH(level)->neighborZ_SP;
+
+    for (uint x = 0; x < para->getParH(level)->intCF.kCF; x++) {
+        sparseIndex = para->getParH(level)->intCF.ICellCFC[x];
+        nodesCFC.push_back(sparseIndex);
+        nodesCFC.push_back(neighborX[sparseIndex]);
+        nodesCFC.push_back(neighborY[sparseIndex]);
+        nodesCFC.push_back(neighborZ[sparseIndex]);
+        nodesCFC.push_back(neighborY[neighborX[sparseIndex]]);
+        nodesCFC.push_back(neighborZ[neighborX[sparseIndex]]);
+        nodesCFC.push_back(neighborZ[neighborY[sparseIndex]]);
+        nodesCFC.push_back(neighborZ[neighborY[neighborX[sparseIndex]]]);
+    }
+
+    // remove duplicate nodes
+    std::sort(nodesCFC.begin(), nodesCFC.end());
+    auto iterator = std::unique(nodesCFC.begin(), nodesCFC.end());
+    nodesCFC.erase(iterator, nodesCFC.end());
+}
+
+void IndexRearrangementForStreams::addUniqueIndexToCommunicationVectors(
+    std::vector<int> &sendIndicesAfterFtoC, int &sparseIndexSend,
+    std::vector<unsigned int> &sendIndicesForCommAfterFtoCPositions, uint &posInSendIndices) const
+{
+    // add index to corresponding vectors, but omit indices which are already in sendIndicesAfterFtoC
+    if (std::find(sendIndicesAfterFtoC.begin(), sendIndicesAfterFtoC.end(), sparseIndexSend) ==
+        sendIndicesAfterFtoC.end()) {
+        sendIndicesAfterFtoC.push_back(sparseIndexSend);
+        sendIndicesForCommAfterFtoCPositions.push_back(posInSendIndices);
+    }
+}
+
+void IndexRearrangementForStreams::findIfSparseIndexIsInSendIndicesAndAddToCommVectors(
+    int sparseIndex, int *sendIndices, uint numberOfSendIndices, std::vector<int> &sendIndicesAfterFtoC,
+    std::vector<uint> &sendIndicesForCommAfterFtoCPositions) const
+{
+    int sparseIndexSend;
+    for (uint posInSendIndices = 0; posInSendIndices < numberOfSendIndices; posInSendIndices++) {
+        sparseIndexSend = sendIndices[posInSendIndices];
+        if (sparseIndex == sparseIndexSend) {
+            addUniqueIndexToCommunicationVectors(sendIndicesAfterFtoC, sparseIndex,
+                                                 sendIndicesForCommAfterFtoCPositions, posInSendIndices);
+            break;
+        }
+    }
+}
+
+void IndexRearrangementForStreams::findIndicesNotInCommAfterFtoC(const uint &numberOfSendOrRecvIndices,
+                                                                 int *sendOrReceiveIndices,
+                                                                 std::vector<int> &sendOrReceiveIndicesAfterFtoC,
+                                                                 std::vector<int> &sendOrReceiveIndicesOther)
+{
+    int sparseIndexSend;
+    for (uint posInSendIndices = 0; posInSendIndices < numberOfSendOrRecvIndices; posInSendIndices++) {
+        sparseIndexSend = sendOrReceiveIndices[posInSendIndices];
+        if (std::find(sendOrReceiveIndicesAfterFtoC.begin(), sendOrReceiveIndicesAfterFtoC.end(), sparseIndexSend) ==
+            sendOrReceiveIndicesAfterFtoC.end())
+            sendOrReceiveIndicesOther.push_back(sparseIndexSend);
+    }
+}
+
+void IndexRearrangementForStreams::reorderRecvIndicesForCommAfterFtoCX(
+    int direction, int level, int indexOfProcessNeighbor, std::vector<uint> &sendIndicesForCommAfterFtoCPositions)
+{
+    int *recvIndices                    = para->getParH(level)->recvProcessNeighborX[indexOfProcessNeighbor].index;
+    int &numberOfRecvNodesAfterFtoC = para->getParH(level)->recvProcessNeighborsAfterFtoCX[indexOfProcessNeighbor].numberOfNodes;
+    reorderRecvIndicesForCommAfterFtoC(recvIndices, numberOfRecvNodesAfterFtoC, direction, level,
+                                       sendIndicesForCommAfterFtoCPositions);
+}
+
+void IndexRearrangementForStreams::reorderRecvIndicesForCommAfterFtoCY(
+    int direction, int level, int indexOfProcessNeighbor, std::vector<uint> &sendIndicesForCommAfterFtoCPositions)
+{
+    int *recvIndices                    = para->getParH(level)->recvProcessNeighborY[indexOfProcessNeighbor].index;
+    int &numberOfRecvNodesAfterFtoC = para->getParH(level)->recvProcessNeighborsAfterFtoCY[indexOfProcessNeighbor].numberOfNodes;
+    reorderRecvIndicesForCommAfterFtoC(recvIndices, numberOfRecvNodesAfterFtoC, direction, level,
+                                       sendIndicesForCommAfterFtoCPositions);
+}
+
+void IndexRearrangementForStreams::reorderRecvIndicesForCommAfterFtoCZ(
+    int direction, int level, int indexOfProcessNeighbor, std::vector<uint> &sendIndicesForCommAfterFtoCPositions)
+{
+    int *recvIndices = para->getParH(level)->recvProcessNeighborZ[indexOfProcessNeighbor].index;
+    int &numberOfRecvNodesAfterFtoC =
+        para->getParH(level)->recvProcessNeighborsAfterFtoCZ[indexOfProcessNeighbor].numberOfNodes;
+    reorderRecvIndicesForCommAfterFtoC(recvIndices, numberOfRecvNodesAfterFtoC, direction, level,
+                                       sendIndicesForCommAfterFtoCPositions);
+}
+
+void IndexRearrangementForStreams::reorderRecvIndicesForCommAfterFtoC(
+    int *recvIndices, int &numberOfRecvNodesAfterFtoC, int direction, int level,
+    std::vector<uint> &sendIndicesForCommAfterFtoCPositions)
+{
+    *logging::out << logging::Logger::INFO_INTERMEDIATE
+                  << "reorder receive indices for communication after fine to coarse: level: " << level
+                  << " direction: " << direction;
+    if (sendIndicesForCommAfterFtoCPositions.size() == 0)
+        *logging::out << logging::Logger::LOGGER_ERROR
+                      << "reorderRecvIndicesForCommAfterFtoC(): sendIndicesForCommAfterFtoCPositions is empty."
+                      << "\n";
+
+    uint numberOfRecvIndices = builder->getNumberOfReceiveIndices(direction, level);
+    std::vector<int> recvIndicesAfterFtoC;
+    std::vector<int> recvIndicesOther;
+
+    // find recvIndices for Communication after fine to coarse
+    for (uint vectorPos : sendIndicesForCommAfterFtoCPositions)
+        recvIndicesAfterFtoC.push_back(recvIndices[vectorPos]);
+
+    findIndicesNotInCommAfterFtoC(numberOfRecvIndices, recvIndices, recvIndicesAfterFtoC, recvIndicesOther);
+
+    numberOfRecvNodesAfterFtoC = (int)recvIndicesAfterFtoC.size();
+
+    // copy new vectors back to recvIndices array
+    for (int i = 0; i < numberOfRecvNodesAfterFtoC; i++)
+        recvIndices[i] = recvIndicesAfterFtoC[i];
+    for (uint i = 0; i < (uint)recvIndicesOther.size(); i++)
+        recvIndices[i + numberOfRecvNodesAfterFtoC] = recvIndicesOther[i];
+
+    *logging::out << logging::Logger::INFO_INTERMEDIATE << "... Process " << communicator.getPID()
+                  << " numberOfRecvNodesAfterFtoC: " << numberOfRecvNodesAfterFtoC << "\n";
+
+    if (numberOfRecvNodesAfterFtoC + recvIndicesOther.size() != numberOfRecvIndices) {
+        *logging::out << logging::Logger::LOGGER_ERROR
+                      << "reorderRecvIndicesForCommAfterFtoC(): incorrect number of nodes"
+                      << "\n";
+        std::cout << "numberOfRecvNodesAfterFtoC = " << numberOfRecvNodesAfterFtoC
+                  << ", recvIndicesOther.size() = " << recvIndicesOther.size()
+                  << ", numberOfRecvIndices = " << numberOfRecvIndices << std::endl;
+    }
+}
+
+void IndexRearrangementForStreams::splitFineToCoarseIntoBorderAndBulk(const uint &level)
+{
+    this->getGridInterfaceIndicesBorderBulkFC(level);
+
+    para->getParD(level)->intFCBorder.kFC      = para->getParH(level)->intFCBorder.kFC;
+    para->getParD(level)->intFCBulk.kFC        = para->getParH(level)->intFCBulk.kFC;
+    para->getParD(level)->intFCBorder.ICellFCC = para->getParD(level)->intFC.ICellFCC;
+    para->getParD(level)->intFCBulk.ICellFCC =
+        para->getParD(level)->intFCBorder.ICellFCC + para->getParD(level)->intFCBorder.kFC;
+    para->getParD(level)->intFCBorder.ICellFCF = para->getParD(level)->intFC.ICellFCF;
+    para->getParD(level)->intFCBulk.ICellFCF =
+        para->getParD(level)->intFCBorder.ICellFCF + para->getParD(level)->intFCBorder.kFC;
+}
+
+void IndexRearrangementForStreams::getGridInterfaceIndicesBorderBulkFC(int level)
+{
+    // create some local variables for better readability
+    uint *iCellFccAll = para->getParH(level)->intFC.ICellFCC;
+    uint *iCellFcfAll = para->getParH(level)->intFC.ICellFCF;
+    auto grid         = this->builder->getGrid((uint)level);
+
+    std::vector<uint> iCellFccBorderVector;
+    std::vector<uint> iCellFccBulkVector;
+    std::vector<uint> iCellFcfBorderVector;
+    std::vector<uint> iCellFcfBulkVector;
+
+    // fill border and bulk vectors with iCellFCs
+    for (uint i = 0; i < para->getParH(level)->intFC.kFC; i++)
+        if (grid->isSparseIndexInFluidNodeIndicesBorder(iCellFccAll[i])) {
+            iCellFccBorderVector.push_back(iCellFccAll[i]);
+            iCellFcfBorderVector.push_back(iCellFcfAll[i]);
+        } else {
+            iCellFccBulkVector.push_back(iCellFccAll[i]);
+            iCellFcfBulkVector.push_back(iCellFcfAll[i]);
+        }
+
+    // set new sizes and pointers
+    para->getParH(level)->intFCBorder.ICellFCC = iCellFccAll;
+    para->getParH(level)->intFCBorder.ICellFCF = iCellFcfAll;
+    para->getParH(level)->intFCBorder.kFC      = (uint)iCellFccBorderVector.size();
+    para->getParH(level)->intFCBulk.kFC        = (uint)iCellFccBulkVector.size();
+    para->getParH(level)->intFCBulk.ICellFCC   = iCellFccAll + para->getParH(level)->intFCBorder.kFC;
+    para->getParH(level)->intFCBulk.ICellFCF   = iCellFcfAll + para->getParH(level)->intFCBorder.kFC;
+
+    // copy the created vectors to the memory addresses of the old arrays
+    // this is inefficient :(
+    for (uint i = 0; i < (uint)iCellFccBorderVector.size(); i++) {
+        iCellFccAll[i] = iCellFccBorderVector[i];
+        iCellFcfAll[i] = iCellFcfBorderVector[i];
+    }
+    for (uint i = 0; i < (uint)iCellFccBulkVector.size(); i++) {
+        para->getParH(level)->intFCBulk.ICellFCC[i] = iCellFccBulkVector[i];
+        para->getParH(level)->intFCBulk.ICellFCF[i] = iCellFcfBulkVector[i];
+    }
+}
+
+void IndexRearrangementForStreams::splitCoarseToFineIntoBorderAndBulk(const uint &level)
+{
+    this->getGridInterfaceIndicesBorderBulkCF(level);
+
+    para->getParD(level)->intCFBorder.kCF      = para->getParH(level)->intCFBorder.kCF;
+    para->getParD(level)->intCFBulk.kCF        = para->getParH(level)->intCFBulk.kCF;
+    para->getParD(level)->intCFBorder.ICellCFC = para->getParD(level)->intCF.ICellCFC;
+    para->getParD(level)->intCFBulk.ICellCFC =
+        para->getParD(level)->intCFBorder.ICellCFC + para->getParD(level)->intCFBorder.kCF;
+    para->getParD(level)->intCFBorder.ICellCFF = para->getParD(level)->intCF.ICellCFF;
+    para->getParD(level)->intCFBulk.ICellCFF =
+        para->getParD(level)->intCFBorder.ICellCFF + para->getParD(level)->intCFBorder.kCF;
+    para->getParD(level)->offCFBulk.xOffCF = para->getParD(level)->offCF.xOffCF + para->getParD(level)->intCFBorder.kCF;
+    para->getParD(level)->offCFBulk.yOffCF = para->getParD(level)->offCF.yOffCF + para->getParD(level)->intCFBorder.kCF;
+    para->getParD(level)->offCFBulk.zOffCF = para->getParD(level)->offCF.zOffCF + para->getParD(level)->intCFBorder.kCF;
+}
+
+void IndexRearrangementForStreams::getGridInterfaceIndicesBorderBulkCF(int level)
+{
+    // create some local variables for better readability
+    uint *iCellCfcAll  = para->getParH(level)->intCF.ICellCFC;
+    uint *iCellCffAll  = para->getParH(level)->intCF.ICellCFF;
+    uint *neighborX_SP = this->para->getParH(level)->neighborX_SP;
+    uint *neighborY_SP = this->para->getParH(level)->neighborY_SP;
+    uint *neighborZ_SP = this->para->getParH(level)->neighborZ_SP;
+    auto grid          = this->builder->getGrid((uint)level);
+
+    std::vector<uint> iCellCfcBorderVector;
+    std::vector<uint> iCellCfcBulkVector;
+    std::vector<uint> iCellCffBorderVector;
+    std::vector<uint> iCellCffBulkVector;
+    std::vector<real> xOffCFBorderVector;
+    std::vector<real> yOffCFBorderVector;
+    std::vector<real> zOffCFBorderVector;
+    std::vector<real> xOffCFBulkVector;
+    std::vector<real> yOffCFBulkVector;
+    std::vector<real> zOffCFBulkVector;
+    uint sparseIndexOfICellBSW;
+
+    // fill border and bulk vectors with iCellCFs
+    for (uint i = 0; i < para->getParH(level)->intCF.kCF; i++) {
+        sparseIndexOfICellBSW = iCellCfcAll[i];
+
+        if (grid->isSparseIndexInFluidNodeIndicesBorder(sparseIndexOfICellBSW) ||
+            grid->isSparseIndexInFluidNodeIndicesBorder(neighborX_SP[sparseIndexOfICellBSW]) ||
+            grid->isSparseIndexInFluidNodeIndicesBorder(neighborY_SP[sparseIndexOfICellBSW]) ||
+            grid->isSparseIndexInFluidNodeIndicesBorder(neighborZ_SP[sparseIndexOfICellBSW]) ||
+            grid->isSparseIndexInFluidNodeIndicesBorder(neighborY_SP[neighborX_SP[sparseIndexOfICellBSW]]) ||
+            grid->isSparseIndexInFluidNodeIndicesBorder(neighborZ_SP[neighborX_SP[sparseIndexOfICellBSW]]) ||
+            grid->isSparseIndexInFluidNodeIndicesBorder(neighborZ_SP[neighborY_SP[sparseIndexOfICellBSW]]) ||
+            grid->isSparseIndexInFluidNodeIndicesBorder(
+                neighborZ_SP[neighborY_SP[neighborX_SP[sparseIndexOfICellBSW]]])) {
+
+            iCellCfcBorderVector.push_back(iCellCfcAll[i]);
+            iCellCffBorderVector.push_back(iCellCffAll[i]);
+            xOffCFBorderVector.push_back(para->getParH(level)->offCF.xOffCF[i]);
+            yOffCFBorderVector.push_back(para->getParH(level)->offCF.yOffCF[i]);
+            zOffCFBorderVector.push_back(para->getParH(level)->offCF.zOffCF[i]);
+        } else {
+            iCellCfcBulkVector.push_back(iCellCfcAll[i]);
+            iCellCffBulkVector.push_back(iCellCffAll[i]);
+            xOffCFBulkVector.push_back(para->getParH(level)->offCF.xOffCF[i]);
+            yOffCFBulkVector.push_back(para->getParH(level)->offCF.yOffCF[i]);
+            zOffCFBulkVector.push_back(para->getParH(level)->offCF.zOffCF[i]);
+        }
+    }
+
+    // set new sizes and pointers
+    para->getParH(level)->intCFBorder.ICellCFC = para->getParH(level)->intCF.ICellCFC;
+    para->getParH(level)->intCFBorder.ICellCFF = para->getParH(level)->intCF.ICellCFF;
+    para->getParH(level)->intCFBorder.kCF      = (uint)iCellCfcBorderVector.size();
+    para->getParH(level)->intCFBulk.kCF        = (uint)iCellCfcBulkVector.size();
+    para->getParH(level)->intCFBulk.ICellCFC =
+        para->getParH(level)->intCF.ICellCFC + para->getParH(level)->intCFBorder.kCF;
+    para->getParH(level)->intCFBulk.ICellCFF =
+        para->getParH(level)->intCF.ICellCFF + para->getParH(level)->intCFBorder.kCF;
+    para->getParH(level)->offCFBulk.xOffCF = para->getParH(level)->offCF.xOffCF + para->getParH(level)->intCFBorder.kCF;
+    para->getParH(level)->offCFBulk.yOffCF = para->getParH(level)->offCF.yOffCF + para->getParH(level)->intCFBorder.kCF;
+    para->getParH(level)->offCFBulk.zOffCF = para->getParH(level)->offCF.zOffCF + para->getParH(level)->intCFBorder.kCF;
+
+    // copy the created vectors to the memory addresses of the old arrays
+    // this is inefficient :(
+    for (uint i = 0; i < (uint)iCellCfcBorderVector.size(); i++) {
+        para->getParH(level)->intCFBorder.ICellCFC[i] = iCellCfcBorderVector[i];
+        para->getParH(level)->intCFBorder.ICellCFF[i] = iCellCffBorderVector[i];
+        para->getParH(level)->offCF.xOffCF[i]         = xOffCFBorderVector[i];
+        para->getParH(level)->offCF.yOffCF[i]         = yOffCFBorderVector[i];
+        para->getParH(level)->offCF.zOffCF[i]         = zOffCFBorderVector[i];
+    }
+    for (uint i = 0; i < (uint)iCellCfcBulkVector.size(); i++) {
+        para->getParH(level)->intCFBulk.ICellCFC[i] = iCellCfcBulkVector[i];
+        para->getParH(level)->intCFBulk.ICellCFF[i] = iCellCffBulkVector[i];
+        para->getParH(level)->offCFBulk.xOffCF[i]   = xOffCFBulkVector[i];
+        para->getParH(level)->offCFBulk.yOffCF[i]   = yOffCFBulkVector[i];
+        para->getParH(level)->offCFBulk.zOffCF[i]   = zOffCFBulkVector[i];
+    }
+}
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5b1ada0e475f4c8a3b214772dc290a3c1a40699
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h
@@ -0,0 +1,164 @@
+//! \file IndexRearrangementForStreams.h
+//! \ingroup GPU
+//! \author Anna Wellmann
+//! \ref master thesis of Anna Wellmann
+
+#ifndef IndexRearrangementForStreams_H
+#define IndexRearrangementForStreams_H
+
+#include <gpu/VirtualFluids_GPU/DataStructureInitializer/GridProvider.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "LBM/LB.h"
+
+class Parameter;
+class GridBuilder;
+namespace vf::gpu
+{
+class Communicator;
+}
+
+class IndexRearrangementForStreams
+{
+public:
+    //! \brief construct IndexRearrangementForStreams object
+    IndexRearrangementForStreams(std::shared_ptr<Parameter> para, std::shared_ptr<GridBuilder> builder, vf::gpu::Communicator& communicator);
+
+    //////////////////////////////////////////////////////////////////////////
+    // communication after fine to coarse
+    //////////////////////////////////////////////////////////////////////////
+
+    //! \brief initialize the arrays for the communication after the interpolation from fine to coarse in x direction
+    //! \details Only the nodes involved in the interpolation need to be exchanged. This method therefore identifies
+    //! all nodes that are part of both the interpolation and the communication.
+    //!
+    //! \ref see master thesis of Anna
+    //! Wellmann (p. 59-62: "Reduzieren der auszutauschenden Knoten")
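+    //!
+    //! A minimal usage sketch (hypothetical variable names; the caller loops over levels, process
+    //! neighbors, and directions):
+    //! \code
+    //! IndexRearrangementForStreams rearranger(para, builder, communicator);
+    //! rearranger.initCommunicationArraysForCommAfterFinetoCoarseX(level, indexOfProcessNeighbor, direction);
+    //! \endcode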
+    void initCommunicationArraysForCommAfterFinetoCoarseX(const uint &level, int indexOfProcessNeighbor,
+                                                          int direction);
+    //! \brief initialize the arrays for the communication after the interpolation from fine to coarse in y direction
+    //! \details --> see x direction
+    void initCommunicationArraysForCommAfterFinetoCoarseY(const uint &level, int indexOfProcessNeighbor,
+                                                          int direction);
+    //! \brief initialize the arrays for the communication after the interpolation from fine to coarse in z direction
+    //! \details --> see x direction
+    void initCommunicationArraysForCommAfterFinetoCoarseZ(const uint &level, int indexOfProcessNeighbor,
+                                                          int direction);
+
+public:
+    //////////////////////////////////////////////////////////////////////////
+    // split interpolation cells
+    //////////////////////////////////////////////////////////////////////////
+
+    //! \brief split the interpolation cells from coarse to fine into border and bulk
+    //! \details For communication hiding, the interpolation cells from the coarse to the fine grid need to be split
+    //! into two groups:
+    //!
+    //! - cells which are at the border between two GPUs --> "border"
+    //!
+    //! - the other cells, which are not directly related to the communication between the two GPUs --> "bulk"
+    //!
+    //! \ref see master thesis of Anna Wellmann (p. 62-68: "Ãœberdeckung der reduzierten Kommunikation")
+    void splitCoarseToFineIntoBorderAndBulk(const uint &level);
+
+    //! \brief split the interpolation cells from fine to coarse into border and bulk
+    //! \details For communication hiding, the interpolation cells from the fine to the coarse grid need to be split
+    //! into two groups:
+    //!
+    //! - cells which are at the border between two GPUs --> "border"
+    //!
+    //! - the other cells, which are not directly related to the communication between the two GPUs --> "bulk"
+    //!
+    //! \ref see master thesis of Anna Wellmann (p. 62-68: "Ãœberdeckung der reduzierten Kommunikation")
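+    //!
+    //! After the split, both groups live in the original arrays; the bulk pointers are plain
+    //! offsets into them (sketch of the resulting layout, not actual code):
+    //! \code
+    //! ICellFCC: [ border cells          | bulk cells                                  ]
+    //!             ^ intFCBorder.ICellFCC  ^ intFCBulk.ICellFCC = ICellFCC + intFCBorder.kFC
+    //! \endcode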
+    void splitFineToCoarseIntoBorderAndBulk(const uint &level);
+
+private:
+    //////////////////////////////////////////////////////////////////////////
+    // communication after fine to coarse
+    //////////////////////////////////////////////////////////////////////////
+
+    //! \brief inits pointers for reduced communication after interpolation fine to coarse by copying them from "normal"
+    //! communication
+    void copyProcessNeighborToCommAfterFtoCX(const uint &level, int indexOfProcessNeighbor);
+    void copyProcessNeighborToCommAfterFtoCY(const uint &level, int indexOfProcessNeighbor);
+    void copyProcessNeighborToCommAfterFtoCZ(const uint &level, int indexOfProcessNeighbor);
+
+    void reorderSendIndicesForCommAfterFtoCX(int direction, int level, int indexOfProcessNeighbor,
+                                             std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+    void reorderSendIndicesForCommAfterFtoCY(int direction, int level, int indexOfProcessNeighbor,
+                                             std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+    void reorderSendIndicesForCommAfterFtoCZ(int direction, int level, int indexOfProcessNeighbor,
+                                             std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+
+    //! \brief reorder the send indices for the communication after the interpolation from fine to coarse
+    //! \details The indices of nodes which are part of the interpolation are moved to the front of the vector of
+    //! send indices.
+    //! \pre para->getParH(level)->intCF needs to be initialized
+    //! \param sendIndices is the pointer to the vector with the send indices, which will be reordered in this function
+    //! \param numberOfSendNodesAfterFtoC will be set in this method
+    //! \param sendIndicesForCommAfterFtoCPositions stores each send index's position before reordering
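+    //!
+    //! A hypothetical example: with sendIndices = {3, 7, 2, 9} and the nodes 7 and 9 taking part in
+    //! the interpolation, the array is reordered to {7, 9, 3, 2}, numberOfSendNodesAfterFtoC is set
+    //! to 2, and sendIndicesForCommAfterFtoCPositions ends up as {1, 3} (the original positions of
+    //! 7 and 9).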
+    void reorderSendIndicesForCommAfterFtoC(int *sendIndices, int &numberOfSendNodesAfterFtoC, int direction,
+                                            int level, std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+    //! \brief check if a sparse index occurs in the ICellFCC
+    bool isSparseIndexInICellFCC(uint sizeOfICellFCC, int sparseIndex, int level);
+    //! \brief aggregate all nodes in the coarse cells for the interpolation in coarse to fine
+    //! \details For the coarse cells in the interpolation from coarse to fine, only one node is stored. This method
+    //! looks for the other nodes of each cell and puts them into a vector. Duplicate nodes are only stored once.
+    void aggregateNodesInICellCFC(int level, std::vector<uint> &nodesCFC);
+    //! \brief add index to sendIndicesAfterFtoC and sendIndicesForCommAfterFtoCPositions, but omit indices which are already in sendIndicesAfterFtoC
+    void addUniqueIndexToCommunicationVectors(std::vector<int> &sendIndicesAfterFtoC, int &sparseIndexSend,
+                                              std::vector<unsigned int> &sendIndicesForCommAfterFtoCPositions,
+                                              uint &posInSendIndices) const;
+    //! \brief find if a sparse index is a send index. If true, call addUniqueIndexToCommunicationVectors()
+    void
+    findIfSparseIndexIsInSendIndicesAndAddToCommVectors(int sparseIndex, int *sendIndices, uint numberOfSendIndices,
+                                                        std::vector<int> &sendIndicesAfterFtoC,
+                                                        std::vector<uint> &sendIndicesForCommAfterFtoCPositions) const;
+    //! \brief find all indices which are not part of the communication after the interpolation from fine to coarse
+    void findIndicesNotInCommAfterFtoC(const uint &numberOfSendOrRecvIndices, int *sendOrReceiveIndices,
+                                       std::vector<int> &sendOrReceiveIndicesAfterFtoC,
+                                       std::vector<int> &sendOrReceiveIndicesOther);
+
+    void reorderRecvIndicesForCommAfterFtoCX(int direction, int level, int indexOfProcessNeighbor,
+                                             std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+    void reorderRecvIndicesForCommAfterFtoCY(int direction, int level, int indexOfProcessNeighbor,
+                                             std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+    void reorderRecvIndicesForCommAfterFtoCZ(int direction, int level, int indexOfProcessNeighbor,
+                                             std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+                                             
+    //! \brief reorder the receive indices in the same way that the send indices were reordered.
+    //! \details When the send indices are reordered, the receive indices need to be reordered accordingly.
+    //! \pre sendIndicesForCommAfterFtoCPositions should not be empty
+    //! \param recvIndices is the pointer to the vector with the receive indices, which will be reordered in this function
+    //! \param numberOfRecvNodesAfterFtoC will be set in this function
+    //! \param sendIndicesForCommAfterFtoCPositions stores each send index's position before reordering and is used to reorder the receive indices in the same way
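+    //!
+    //! Continuing the hypothetical example above: the receiving process gets the positions {1, 3}
+    //! via MPI and pulls recvIndices[1] and recvIndices[3] to the front of its own index array, so
+    //! both sides agree on the layout of the reduced communication buffers.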
+    void reorderRecvIndicesForCommAfterFtoC(int *recvIndices, int &numberOfRecvNodesAfterFtoC, int direction,
+                                            int level, std::vector<uint> &sendIndicesForCommAfterFtoCPositions);
+
+private:
+    //////////////////////////////////////////////////////////////////////////
+    // split interpolation cells
+    //////////////////////////////////////////////////////////////////////////
+
+    //! \brief This function reorders the arrays of CFC/CFF indices and sets the pointers and sizes of the new
+    //! subarrays.
+    //! \details The coarse cells for interpolation from coarse to fine (iCellCFC) are divided into two
+    //! subgroups: border and bulk. The fine cells (iCellCFF) are reordered accordingly. The offsets (xOffCF,
+    //! yOffCF, zOffCF) must be reordered in the same way.
+    void getGridInterfaceIndicesBorderBulkCF(int level);
+
+    //! \brief This function reorders the arrays of FCC/FCF indices and sets the pointers and sizes of the new
+    //! subarrays.
+    //! \details The coarse cells for interpolation from fine to coarse (iCellFCC) are divided into two subgroups:
+    //! border and bulk. The fine cells (iCellFCF) are reordered accordingly.
+    void getGridInterfaceIndicesBorderBulkFC(int level);
+
+
+private:
+    std::shared_ptr<GridBuilder> builder;
+    std::shared_ptr<Parameter> para;
+    vf::gpu::Communicator& communicator;
+
+    // used for tests
+    friend class IndexRearrangementForStreamsTest_reorderSendIndices;
+};
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cfg b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..e414d4f3173e555b8944fa9637ebbd2023ce393c
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cfg
@@ -0,0 +1,3 @@
+# these two parameters need to be defined in each config file
+Path = /output/path
+GridPath = /path/to/grid
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e238b624b057a0650116a940132115a67d9aab2
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreamsTest.cpp
@@ -0,0 +1,343 @@
+#include <gmock/gmock.h>
+
+#include <algorithm>
+#include <filesystem>
+#include <iostream>
+#include <mpi.h>
+
+#include "Parameter/Parameter.h"
+#include "basics/config/ConfigurationFile.h"
+#include "DataStructureInitializer/GridReaderGenerator/IndexRearrangementForStreams.h"
+#include "gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "gpu/GridGenerator/grid/GridImp.h"
+#include "gpu/GridGenerator/utilities/communication.h"
+#include "gpu/VirtualFluids_GPU/Communication/Communicator.cpp"
+
+template <typename T>
+bool vectorsAreEqual(const T *vector1, const std::vector<T> &vectorExpected)
+{
+    for (uint i = 0; i < vectorExpected.size(); i++) {
+        if (vector1[i] != vectorExpected[i])
+            return false;
+    }
+    return true;
+}
+
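+// hand-written test double: overrides only the LevelGridBuilder queries that the test subject calls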
+class LevelGridBuilderDouble : public LevelGridBuilder
+{
+private:
+    SPtr<Grid> grid;
+    LevelGridBuilderDouble() = default;
+
+    uint numberOfSendIndices;
+
+public:
+    LevelGridBuilderDouble(SPtr<Grid> grid) : LevelGridBuilder(), grid(grid){};
+    SPtr<Grid> getGrid(uint level) override { return grid; };
+    std::shared_ptr<Grid> getGrid(int level, int box) override { return grid; };
+    void setNumberOfSendIndices(uint numberOfSendIndices) { this->numberOfSendIndices = numberOfSendIndices; };
+    uint getNumberOfSendIndices(int direction, uint level) override { return numberOfSendIndices; };
+};
+
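+// test double for GridImp that lets the test inject the list of border nodes checked by
+// isSparseIndexInFluidNodeIndicesBorder()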
+class GridImpDouble : public GridImp
+{
+private:
+    std::vector<uint> fluidNodeIndicesBorder;
+
+public:
+    GridImpDouble(Object *object, real startX, real startY, real startZ, real endX, real endY, real endZ, real delta,
+                  Distribution d, uint level)
+        : GridImp(object, startX, startY, startZ, endX, endY, endZ, delta, d, level)
+    {
+    }
+
+    static SPtr<GridImpDouble> makeShared(Object *object, real startX, real startY, real startZ, real endX, real endY,
+                                          real endZ, real delta, Distribution d,
+                                          uint level)
+    {
+        SPtr<GridImpDouble> grid(
+            new GridImpDouble(object, startX, startY, startZ, endX, endY, endZ, delta, d, level));
+        return grid;
+    }
+
+    void setFluidNodeIndicesBorder(std::vector<uint> fluidNodeIndicesBorder)
+    {
+        this->fluidNodeIndicesBorder = fluidNodeIndicesBorder;
+    }
+
+    bool isSparseIndexInFluidNodeIndicesBorder(uint &sparseIndex) const override
+    {
+        return std::find(this->fluidNodeIndicesBorder.begin(), this->fluidNodeIndicesBorder.end(), sparseIndex) !=
+               this->fluidNodeIndicesBorder.end();
+    }
+};
+
+struct CFBorderBulk {
+    // data to work on
+    std::vector<uint> fluidNodeIndicesBorder = { 10, 11, 12, 13, 14, 15, 16 };
+    std::vector<uint> iCellCFC               = { 1, 11, 3, 13, 5, 15, 7 };
+    std::vector<uint> iCellCFF               = { 2, 12, 4, 14, 6, 16, 8 };
+    uint sizeOfICellCf                       = (uint)iCellCFC.size();
+    uint neighborX_SP[17]                    = { 0u };
+    uint neighborY_SP[17]                    = { 0u };
+    uint neighborZ_SP[17]                    = { 0u };
+    int level                                = 0;
+    std::vector<real> offsetCFx              = { 1, 11, 3, 13, 5, 15, 7 };
+    std::vector<real> offsetCFy              = { 101, 111, 103, 113, 105, 115, 107 };
+    std::vector<real> offsetCFz              = { 1001, 1011, 1003, 1013, 1005, 1015, 1007 };
+
+    // expected data
+    std::vector<uint> iCellCfcBorder_expected   = { 11, 13, 15 };
+    std::vector<uint> iCellCfcBulk_expected     = { 1, 3, 5, 7 };
+    std::vector<uint> iCellCffBorder_expected   = { 12, 14, 16 };
+    std::vector<uint> iCellCffBulk_expected     = { 2, 4, 6, 8 };
+    std::vector<real> offsetCFx_Border_expected = { 11, 13, 15 };
+    std::vector<real> offsetCFx_Bulk_expected   = { 1, 3, 5, 7 };
+    std::vector<real> offsetCFy_Border_expected = { 111, 113, 115 };
+    std::vector<real> offsetCFy_Bulk_expected   = { 101, 103, 105, 107 };
+    std::vector<real> offsetCFz_Border_expected = { 1011, 1013, 1015 };
+    std::vector<real> offsetCFz_Bulk_expected   = { 1001, 1003, 1005, 1007 };
+};
+
+static SPtr<Parameter> initParameterClass()
+{
+    std::filesystem::path filePath = __FILE__; // assumes that the config file is stored next to this file
+    filePath.replace_filename("IndexRearrangementForStreamsTest.cfg");
+    vf::basics::ConfigurationFile config;
+    config.load(filePath.string());
+    return std::make_shared<Parameter>(config, 1, 0);
+}
+
+class IndexRearrangementForStreamsTest_IndicesCFBorderBulkTest : public testing::Test
+{
+protected:
+    CFBorderBulk cf;
+    SPtr<Parameter> para;
+    std::unique_ptr<IndexRearrangementForStreams> testSubject;
+
+private:
+    std::unique_ptr<IndexRearrangementForStreams> createTestSubjectCFBorderBulk()
+    {
+        SPtr<GridImpDouble> grid =
+            GridImpDouble::makeShared(nullptr, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, Distribution(), 1);
+        grid->setFluidNodeIndicesBorder(cf.fluidNodeIndicesBorder);
+        std::shared_ptr<LevelGridBuilderDouble> builder = std::make_shared<LevelGridBuilderDouble>(grid);
+
+        para->setMaxLevel(cf.level + 1); // setMaxLevel resizes parH and parD
+        para->parH[cf.level]                    = std::make_shared<LBMSimulationParameter>();
+        para->parD[cf.level]                    = std::make_shared<LBMSimulationParameter>();
+        para->getParH(cf.level)->intCF.ICellCFC = &(cf.iCellCFC.front());
+        para->getParH(cf.level)->intCF.ICellCFF = &(cf.iCellCFF.front());
+        para->getParH(cf.level)->neighborX_SP   = cf.neighborX_SP;
+        para->getParH(cf.level)->neighborY_SP   = cf.neighborY_SP;
+        para->getParH(cf.level)->neighborZ_SP   = cf.neighborZ_SP;
+        para->getParH(cf.level)->intCF.kCF      = cf.sizeOfICellCf;
+        para->getParH(cf.level)->offCF.xOffCF   = &(cf.offsetCFx.front());
+        para->getParH(cf.level)->offCF.yOffCF   = &(cf.offsetCFy.front());
+        para->getParH(cf.level)->offCF.zOffCF   = &(cf.offsetCFz.front());
+
+        return std::make_unique<IndexRearrangementForStreams>(para, builder, vf::gpu::Communicator::getInstance());
+    }
+
+    void SetUp() override
+    {
+        para        = initParameterClass();
+        testSubject = createTestSubjectCFBorderBulk();
+    }
+};
+
+TEST_F(IndexRearrangementForStreamsTest_IndicesCFBorderBulkTest, splitCoarseToFineIntoBorderAndBulk)
+{
+    testSubject->splitCoarseToFineIntoBorderAndBulk(cf.level);
+
+    EXPECT_THAT(para->getParH(cf.level)->intCFBorder.kCF + para->getParH(cf.level)->intCFBulk.kCF,
+                testing::Eq(cf.sizeOfICellCf))
+        << "The number of interpolation cells from coarse to fine changed during reordering.";
+
+    // check coarse to fine border (coarse nodes)
+    EXPECT_THAT(para->getParH(cf.level)->intCFBorder.kCF, testing::Eq((uint)cf.iCellCfcBorder_expected.size()));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->intCFBorder.ICellCFC, cf.iCellCfcBorder_expected))
+        << "intCFBorder.ICellCFC does not match the expected border vector";
+    // check coarse to fine border (fine nodes)
+    EXPECT_THAT(para->getParH(cf.level)->intCFBorder.kCF, testing::Eq((uint)cf.iCellCffBorder_expected.size()));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->intCFBorder.ICellCFF, cf.iCellCffBorder_expected))
+        << "intCFBorder.ICellCFF does not match the expected border vector";
+
+    // check coarse to fine bulk (coarse nodes)
+    EXPECT_THAT(para->getParH(cf.level)->intCFBulk.kCF, testing::Eq((uint)cf.iCellCfcBulk_expected.size()));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->intCFBulk.ICellCFC, cf.iCellCfcBulk_expected))
+        << "intCFBulk.ICellCFC does not match the expected bulk vector";
+    // check coarse to fine bulk (fine nodes)
+    EXPECT_THAT(para->getParH(cf.level)->intCFBulk.kCF, testing::Eq((uint)cf.iCellCffBulk_expected.size()));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->intCFBulk.ICellCFF, cf.iCellCffBulk_expected))
+        << "intCFBulk.ICellCFF does not match the expected bulk vector";
+
+    // check offset cells
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->offCF.xOffCF, cf.offsetCFx_Border_expected));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->offCFBulk.xOffCF, cf.offsetCFx_Bulk_expected));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->offCF.yOffCF, cf.offsetCFy_Border_expected));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->offCFBulk.yOffCF, cf.offsetCFy_Bulk_expected));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->offCF.zOffCF, cf.offsetCFz_Border_expected));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(cf.level)->offCFBulk.zOffCF, cf.offsetCFz_Bulk_expected));
+}
+
+struct FCBorderBulk {
+    // data to work on
+    std::vector<uint> fluidNodeIndicesBorder = { 110, 111, 112, 113, 114, 115, 116 };
+    std::vector<uint> iCellFCC               = { 11, 111, 13, 113, 15, 115, 17 };
+    std::vector<uint> iCellFCF               = { 12, 112, 14, 114, 16, 116, 18 };
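+    // the pairs 111/112, 113/114 and 115/116 appear in fluidNodeIndicesBorder
+    // and are expected in the border partition; 11/12, 13/14, 15/16 and 17/18
+    // form the bulk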
+    uint sizeOfICellFC                       = (uint)iCellFCC.size();
+    int level                                = 1;
+
+    // expected data
+    std::vector<uint> iCellFccBorder_expected = { 111, 113, 115 };
+    std::vector<uint> iCellFccBulk_expected   = { 11, 13, 15, 17 };
+    std::vector<uint> iCellFcfBorder_expected = { 112, 114, 116 };
+    std::vector<uint> iCellFcfBulk_expected   = { 12, 14, 16, 18 };
+};
+
+class IndexRearrangementForStreamsTest_IndicesFCBorderBulkTest : public testing::Test
+{
+protected:
+    FCBorderBulk fc;
+    SPtr<Parameter> para;
+    std::unique_ptr<IndexRearrangementForStreams> testSubject;
+
+private:
+    std::unique_ptr<IndexRearrangementForStreams> createTestSubjectFCBorderBulk()
+    {
+        SPtr<GridImpDouble> grid =
+            GridImpDouble::makeShared(nullptr, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, Distribution(), 1);
+        grid->setFluidNodeIndicesBorder(fc.fluidNodeIndicesBorder);
+        std::shared_ptr<LevelGridBuilderDouble> builder = std::make_shared<LevelGridBuilderDouble>(grid);
+
+        para->setMaxLevel(fc.level + 1); // setMaxLevel resizes parH and parD
+        para->parH[fc.level]                    = std::make_shared<LBMSimulationParameter>();
+        para->parD[fc.level]                    = std::make_shared<LBMSimulationParameter>();
+        para->getParH(fc.level)->intFC.ICellFCC = &(fc.iCellFCC.front());
+        para->getParH(fc.level)->intFC.ICellFCF = &(fc.iCellFCF.front());
+        para->getParH(fc.level)->intFC.kFC      = fc.sizeOfICellFC;
+
+        return std::make_unique<IndexRearrangementForStreams>(para, builder, vf::gpu::Communicator::getInstance());
+    }
+
+    void SetUp() override
+    {
+        para        = initParameterClass();
+        testSubject = createTestSubjectFCBorderBulk();
+    }
+};
+
+TEST_F(IndexRearrangementForStreamsTest_IndicesFCBorderBulkTest, splitFineToCoarseIntoBorderAndBulk)
+{
+    testSubject->splitFineToCoarseIntoBorderAndBulk(fc.level);
+
+    EXPECT_THAT(para->getParH(fc.level)->intFCBorder.kFC + para->getParH(fc.level)->intFCBulk.kFC,
+                testing::Eq(fc.sizeOfICellFC))
+        << "The number of interpolation cells from coarse to fine changed during reordering.";
+
+    // check fine to coarse border (coarse nodes)
+    EXPECT_THAT(para->getParH(fc.level)->intFCBorder.kFC, testing::Eq((uint)fc.iCellFccBorder_expected.size()));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(fc.level)->intFCBorder.ICellFCC, fc.iCellFccBorder_expected))
+        << "intFCBorder.ICellFCC does not match the expected border vector";
+    // check fine to coarse border (fine nodes)
+    EXPECT_THAT(para->getParH(fc.level)->intFCBorder.kFC, testing::Eq((uint)fc.iCellFcfBorder_expected.size()));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(fc.level)->intFCBorder.ICellFCF, fc.iCellFcfBorder_expected))
+        << "intFCBorder.ICellFCF does not match the expected border vector";
+
+    // check fine to coarse bulk (coarse nodes)
+    EXPECT_THAT(para->getParH(fc.level)->intFCBulk.kFC, testing::Eq((uint)fc.iCellFccBulk_expected.size()));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(fc.level)->intFCBulk.ICellFCC, fc.iCellFccBulk_expected))
+        << "intFCBulk.ICellFCC does not match the expected bulk vector";
+    // check fine to coarse bulk (fine nodes)
+    EXPECT_THAT(para->getParH(fc.level)->intFCBulk.kFC, testing::Eq((uint)fc.iCellFcfBulk_expected.size()));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(fc.level)->intFCBulk.ICellFCF, fc.iCellFcfBulk_expected))
+        << "intFCBulk.ICellFCF does not match the expected bulk vector";
+}
+
+struct SendIndicesForCommAfterFtoCX {
+    // data to work on
+    std::vector<int> sendIndices = { 10, 11, 12, 13, 14, 15, 16 };
+    int level                    = 0;
+    int direction                = CommunicationDirections::MX;
+    int numberOfProcessNeighbors = 1;
+    int indexOfProcessNeighbor   = 0;
+
+    std::vector<uint> iCellCFC = { 8, 10, 12 };
+    std::vector<uint> iCellFCC = { 14, 16, 18 };
+    uint kCF                   = (uint)iCellCFC.size();
+    uint kFC                   = (uint)iCellFCC.size();
+    uint neighborX_SP[18]      = { 0u };
+    uint neighborY_SP[18]      = { 0u };
+    uint neighborZ_SP[18]      = { 0u };
+
+    // output data
+    std::vector<uint> sendIndicesForCommAfterFtoCPositions;
+
+    // expected data
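+    // 14 and 16 appear in iCellFCC and 10 and 12 in iCellCFC, so these four
+    // indices are expected at the front of the reordered send buffer;
+    // { 4, 6, 0, 2 } are their zero-based positions in sendIndices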
+    std::vector<uint> sendIndicesForCommAfterFtoCPositions_expected = { 4, 6, 0, 2 };
+    std::vector<int> sendProcessNeighborX_expected                  = { 14, 16, 10, 12, 11, 13, 15 };
+    int numberOfSendNodesAfterFtoC_expected = (int)sendIndicesForCommAfterFtoCPositions_expected.size();
+};
+
+class IndexRearrangementForStreamsTest_reorderSendIndices : public testing::Test
+{
+protected:
+    SendIndicesForCommAfterFtoCX si;
+    SPtr<Parameter> para;
+    std::unique_ptr<IndexRearrangementForStreams> testSubject;
+
+    void act()
+    {
+        testSubject->reorderSendIndicesForCommAfterFtoCX(si.direction, si.level, si.indexOfProcessNeighbor,
+                                                         si.sendIndicesForCommAfterFtoCPositions);
+    }
+private:
+    std::unique_ptr<IndexRearrangementForStreams> createTestSubjectReorderSendIndices()
+    {
+        logging::Logger::addStream(&std::cout);
+
+        SPtr<GridImpDouble> grid =
+            GridImpDouble::makeShared(nullptr, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, Distribution(), 1);
+        std::shared_ptr<LevelGridBuilderDouble> builder = std::make_shared<LevelGridBuilderDouble>(grid);
+
+        builder->setNumberOfSendIndices((uint)si.sendIndices.size());
+        para->setMaxLevel(si.level + 1); // setMaxLevel resizes parH and parD
+        para->parH[si.level] = std::make_shared<LBMSimulationParameter>();
+        para->parD[si.level] = std::make_shared<LBMSimulationParameter>();
+
+        para->getParH(si.level)->intFC.kFC      = si.kFC;
+        para->getParH(si.level)->intFC.ICellFCC = &(si.iCellFCC.front());
+        para->getParH(si.level)->intCF.ICellCFC = &(si.iCellCFC.front());
+        para->getParH(si.level)->intCF.kCF      = si.kCF;
+        para->getParH(si.level)->neighborX_SP   = si.neighborX_SP;
+        para->getParH(si.level)->neighborY_SP   = si.neighborY_SP;
+        para->getParH(si.level)->neighborZ_SP   = si.neighborZ_SP;
+
+        para->setNumberOfProcessNeighborsX(si.numberOfProcessNeighbors, si.level, "send");
+        para->getParH(si.level)->sendProcessNeighborX[si.indexOfProcessNeighbor].index = si.sendIndices.data();
+        para->initProcessNeighborsAfterFtoCX(si.level);
+
+        return std::make_unique<IndexRearrangementForStreams>(para, builder, vf::gpu::Communicator::getInstance());
+    }
+
+    void SetUp() override
+    {
+        para        = initParameterClass();
+        testSubject = createTestSubjectReorderSendIndices();
+    }
+};
+
+TEST_F(IndexRearrangementForStreamsTest_reorderSendIndices, reorderSendIndicesForCommAfterFtoCX)
+{
+    act();
+
+    EXPECT_THAT(si.sendIndicesForCommAfterFtoCPositions.size(),
+                testing::Eq(si.sendIndicesForCommAfterFtoCPositions_expected.size()));
+    EXPECT_THAT(si.sendIndicesForCommAfterFtoCPositions, testing::Eq(si.sendIndicesForCommAfterFtoCPositions_expected));
+
+    EXPECT_THAT(para->getParH(si.level)->sendProcessNeighborsAfterFtoCX[si.indexOfProcessNeighbor].numberOfNodes,
+                testing::Eq(si.numberOfSendNodesAfterFtoC_expected));
+    EXPECT_TRUE(vectorsAreEqual(para->getParH(si.level)->sendProcessNeighborX[si.indexOfProcessNeighbor].index,
+                                si.sendProcessNeighborX_expected))
+        << "sendProcessNeighborX[].index does not match the expected vector";
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
index 4e8eb124731cffb54a51018fa6f06da45f671c73..6efd3e4b1f91051e9609783298b56491abed731d 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
@@ -5,12 +5,13 @@
 #include <math.h>
 
 #include <Parameter/Parameter.h>
-#include <PreCollisionInteractor/ActuatorLine.h>
-#include <PreCollisionInteractor/Probes/Probe.h>
+#include "Parameter/CudaStreamManager.h"
+#include "PreCollisionInteractor/ActuatorLine.h"
+#include "PreCollisionInteractor/Probes/Probe.h"
 
 #include "Calculation/PorousMedia.h"
 
-#include <lbm/constants/NumericConstants.h>
+#include "lbm/constants/NumericConstants.h"
 
 void CudaMemoryManager::cudaAllocFull(int lev)
 {
@@ -510,19 +511,35 @@ void CudaMemoryManager::cudaCopyProcessNeighborXIndex(int lev, unsigned int proc
 								parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].memsizeIndex, 
 								cudaMemcpyHostToDevice));
 }
-void CudaMemoryManager::cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor)
-{
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], 
-								parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], 
-								parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].memsizeFs, 
-								cudaMemcpyHostToDevice));
-}
-void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor)
-{
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0], 
-								parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], 
-								parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].memsizeFs, 
-								cudaMemcpyDeviceToHost));
+void CudaMemoryManager::cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor,
+                                                     const unsigned int &memsizeFsRecv, int streamIndex)
+{
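+    // streamIndex == -1: synchronous cudaMemcpy on the default stream;
+    // otherwise the copy runs asynchronously on the stream held by the
+    // CudaStreamManager (the Y and Z variants below follow the same convention)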
+    if (streamIndex == -1)
+        checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0], 
+						 parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0], 
+						 parameter->getD3Qxx() * memsizeFsRecv, 
+						 cudaMemcpyHostToDevice));
+    else
+        checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborX[processNeighbor].f[0],
+                         parameter->getParH(lev)->recvProcessNeighborX[processNeighbor].f[0],
+                         parameter->getD3Qxx() * memsizeFsRecv,
+                         cudaMemcpyHostToDevice,
+                         parameter->getStreamManager()->getStream(streamIndex)));
+}
+void CudaMemoryManager::cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor,
+                                                     const unsigned int &memsizeFsSend, int streamIndex)
+{
+    if (streamIndex == -1)
+    	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0], 
+    								parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], 
+    								parameter->getD3Qxx() * memsizeFsSend, 
+    								cudaMemcpyDeviceToHost));
+    else
+        checkCudaErrors( cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborX[processNeighbor].f[0], 
+    								     parameter->getParD(lev)->sendProcessNeighborX[processNeighbor].f[0], 
+    								     parameter->getD3Qxx() * memsizeFsSend, 
+    								     cudaMemcpyDeviceToHost,
+                                         parameter->getStreamManager()->getStream(streamIndex)));
 }
 void CudaMemoryManager::cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor)
 {
@@ -565,19 +582,35 @@ void CudaMemoryManager::cudaCopyProcessNeighborYIndex(int lev, unsigned int proc
 								parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].memsizeIndex, 
 								cudaMemcpyHostToDevice));
 }
-void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor)
-{
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], 
-								parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], 
-								parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].memsizeFs, 
-								cudaMemcpyHostToDevice));
-}
-void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor)
-{
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], 
-								parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], 
-								parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].memsizeFs, 
-								cudaMemcpyDeviceToHost));
+void CudaMemoryManager::cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
+                                                     int streamIndex)
+{
+    if (streamIndex == -1)
+	    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0], 
+								    parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0], 
+								    parameter->getD3Qxx() * memsizeFsRecv, 
+								    cudaMemcpyHostToDevice));
+    else
+        checkCudaErrors(cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborY[processNeighbor].f[0],
+                        parameter->getParH(lev)->recvProcessNeighborY[processNeighbor].f[0],
+                        parameter->getD3Qxx() * memsizeFsRecv,
+                        cudaMemcpyHostToDevice, 
+                        parameter->getStreamManager()->getStream(streamIndex)));
+}
+void CudaMemoryManager::cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
+                                                     int streamIndex)
+{
+    if (streamIndex == -1)
+	    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0], 
+	    							parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0], 
+	    							parameter->getD3Qxx() * memsizeFsSend, 
+	    							cudaMemcpyDeviceToHost));
+    else
+        checkCudaErrors(
+            cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborY[processNeighbor].f[0],
+                            parameter->getParD(lev)->sendProcessNeighborY[processNeighbor].f[0],
+                            parameter->getD3Qxx() * memsizeFsSend,
+                            cudaMemcpyDeviceToHost, parameter->getStreamManager()->getStream(streamIndex)));
 }
 void CudaMemoryManager::cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor)
 {
@@ -620,19 +653,35 @@ void CudaMemoryManager::cudaCopyProcessNeighborZIndex(int lev, unsigned int proc
 								parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].memsizeIndex, 
 								cudaMemcpyHostToDevice));
 }
-void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor)
-{
-	checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0], 
-								parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], 
-								parameter->getD3Qxx() * parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].memsizeFs, 
-								cudaMemcpyHostToDevice));
-}
-void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor)
-{
-	checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0], 
-								parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], 
-								parameter->getD3Qxx() * parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].memsizeFs, 
-								cudaMemcpyDeviceToHost));
+void CudaMemoryManager::cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor,
+                                                     const unsigned int &memsizeFsRecv, int streamIndex)
+{
+    if (streamIndex == -1)
+	    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0], 
+	    							parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], 
+	    							parameter->getD3Qxx() * memsizeFsRecv, 
+	    							cudaMemcpyHostToDevice));
+    else
+        checkCudaErrors( cudaMemcpyAsync(parameter->getParD(lev)->recvProcessNeighborZ[processNeighbor].f[0], 
+	    				                 parameter->getParH(lev)->recvProcessNeighborZ[processNeighbor].f[0], 
+	    				                 parameter->getD3Qxx() * memsizeFsRecv, 
+	    				                 cudaMemcpyHostToDevice, 
+                                         parameter->getStreamManager()->getStream(streamIndex)));
+}
+void CudaMemoryManager::cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor,
+                                                     const unsigned int &memsizeFsSend, int streamIndex)
+{
+    if (streamIndex == -1)
+        checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0], 
+	        					    parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], 
+	        					    parameter->getD3Qxx() * memsizeFsSend, 
+	        					    cudaMemcpyDeviceToHost));
+    else
+        checkCudaErrors( cudaMemcpyAsync(parameter->getParH(lev)->sendProcessNeighborZ[processNeighbor].f[0], 
+	        						     parameter->getParD(lev)->sendProcessNeighborZ[processNeighbor].f[0], 
+	        						     parameter->getD3Qxx() * memsizeFsSend, 
+	        						     cudaMemcpyDeviceToHost, 
+                                         parameter->getStreamManager()->getStream(streamIndex)));
 }
 void CudaMemoryManager::cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor)
 {
@@ -929,6 +978,74 @@ void CudaMemoryManager::cudaFreeTurbulentViscosity(int lev)
     // checkCudaErrors(cudaFreeHost(parameter->getParH(lev)->gDyvz));
     // checkCudaErrors(cudaFreeHost(parameter->getParH(lev)->gDzvz));
 }
+//turbulence intensity
+void CudaMemoryManager::cudaAllocTurbulenceIntensity(int lev, uint size)
+{
+    uint mem_size = sizeof(real) * size;
+    // Host
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vxx        ), mem_size));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vyy        ), mem_size));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vzz        ), mem_size));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vxy        ), mem_size));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vxz        ), mem_size));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vyz        ), mem_size));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vx_mean    ), mem_size));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vy_mean    ), mem_size));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->vz_mean    ), mem_size));
+    //Device
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vxx            ), mem_size));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vyy            ), mem_size));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vzz            ), mem_size));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vxy            ), mem_size));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vxz            ), mem_size));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vyz            ), mem_size));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vx_mean        ), mem_size));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vy_mean        ), mem_size));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->vz_mean        ), mem_size));
+    //////////////////////////////////////////////////////////////////////////
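+    // nine real-valued fields were allocated on host and device above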
+    double tmp = 9. * (double)mem_size;
+    setMemsizeGPU(tmp, false);
+}
+void CudaMemoryManager::cudaCopyTurbulenceIntensityHD(int lev, uint size)
+{
+    uint mem_size = sizeof(real) * size;
+    //copy host to device
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vxx    ,  parameter->getParH(lev)->vxx    ,  mem_size , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vyy    ,  parameter->getParH(lev)->vyy    ,  mem_size , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vzz    ,  parameter->getParH(lev)->vzz    ,  mem_size , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vxy    ,  parameter->getParH(lev)->vxy    ,  mem_size , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vxz    ,  parameter->getParH(lev)->vxz    ,  mem_size , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vyz    ,  parameter->getParH(lev)->vyz    ,  mem_size , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vx_mean,  parameter->getParH(lev)->vx_mean,  mem_size , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vy_mean,  parameter->getParH(lev)->vy_mean,  mem_size , cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->vz_mean,  parameter->getParH(lev)->vz_mean,  mem_size , cudaMemcpyHostToDevice));
+}
+void CudaMemoryManager::cudaCopyTurbulenceIntensityDH(int lev, uint size)
+{
+    uint mem_size = sizeof(real) * size;
+    //copy device to host
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vxx    ,  parameter->getParD(lev)->vxx    ,  mem_size , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vyy    ,  parameter->getParD(lev)->vyy    ,  mem_size , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vzz    ,  parameter->getParD(lev)->vzz    ,  mem_size , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vxy    ,  parameter->getParD(lev)->vxy    ,  mem_size , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vxz    ,  parameter->getParD(lev)->vxz    ,  mem_size , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vyz    ,  parameter->getParD(lev)->vyz    ,  mem_size , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vx_mean,  parameter->getParD(lev)->vx_mean,  mem_size , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vy_mean,  parameter->getParD(lev)->vy_mean,  mem_size , cudaMemcpyDeviceToHost));
+    checkCudaErrors( cudaMemcpy(parameter->getParH(lev)->vz_mean,  parameter->getParD(lev)->vz_mean,  mem_size , cudaMemcpyDeviceToHost));
+}
+void CudaMemoryManager::cudaFreeTurbulenceIntensity(int lev)
+{
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vxx     ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vyy     ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vzz     ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vxy     ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vxz     ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vyz     ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vx_mean ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vy_mean ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->vz_mean ));
+}
 //median
 void CudaMemoryManager::cudaAllocMedianSP(int lev)
 {
@@ -1023,6 +1140,14 @@ void CudaMemoryManager::cudaCopyInterfaceFC(int lev)
     checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->intFC.ICellFCF, parameter->getParH(lev)->intFC.ICellFCF, parameter->getParH(lev)->mem_size_kFC, cudaMemcpyHostToDevice));
     checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->intFC.ICellFCC, parameter->getParH(lev)->intFC.ICellFCC, parameter->getParH(lev)->mem_size_kFC, cudaMemcpyHostToDevice));
 }
+void CudaMemoryManager::cudaCheckInterfaceFCBulk(int lev)
+{
+    // only use for testing: copy the bulk interpolation cell indices back to the host and print them
+    size_t memsize = sizeof(uint) * parameter->getParH(lev)->intFCBulk.kFC;
+    checkCudaErrors(cudaMemcpy(parameter->getParH(lev)->intFCBulk.ICellFCC, parameter->getParD(lev)->intFCBulk.ICellFCC, memsize, cudaMemcpyDeviceToHost));
+    for (uint i = 0; i < parameter->getParH(lev)->intFCBulk.kFC; i++)
+        printf("%u %u\n", i, parameter->getParH(lev)->intFCBulk.ICellFCC[i]);
+}
 void CudaMemoryManager::cudaFreeInterfaceFC(int lev)
 {
     checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->intFC.ICellFCF));
@@ -2748,6 +2873,7 @@ void CudaMemoryManager::cudaFreeProcessNeighborADZ(int lev, unsigned int process
     checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->recvProcessNeighborADZ[processNeighbor].index  ));
     checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->recvProcessNeighborADZ[processNeighbor].f[0]     ));
 }
+
 void CudaMemoryManager::cudaAlloc2ndOrderDerivitivesIsoTest(int lev)
 {
     //Host
@@ -2785,6 +2911,51 @@ void CudaMemoryManager::cudaFree2ndOrderDerivitivesIsoTest(int lev)
     checkCudaErrors(cudaFreeHost(parameter->getParH(lev)->dzzUz));
     
 }
+
+void CudaMemoryManager::cudaAllocFluidNodeIndices(int lev) {
+    uint mem_size_geo_fluid_nodes = sizeof(uint) * parameter->getParH(lev)->numberOfFluidNodes;
+    // Host
+    checkCudaErrors(cudaMallocHost((void **)&(parameter->getParH(lev)->fluidNodeIndices), mem_size_geo_fluid_nodes));
+    // Device
+    checkCudaErrors(cudaMalloc((void **)&(parameter->getParD(lev)->fluidNodeIndices), mem_size_geo_fluid_nodes));
+    //////////////////////////////////////////////////////////////////////////
+    setMemsizeGPU((double)mem_size_geo_fluid_nodes, false);
+}
+
+void CudaMemoryManager::cudaCopyFluidNodeIndices(int lev) {
+    uint mem_size_geo_fluid_nodes = sizeof(uint) * parameter->getParH(lev)->numberOfFluidNodes;
+    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->fluidNodeIndices,
+                               parameter->getParH(lev)->fluidNodeIndices,
+                               mem_size_geo_fluid_nodes, cudaMemcpyHostToDevice));
+}
+
+void CudaMemoryManager::cudaFreeFluidNodeIndices(int lev) {
+    checkCudaErrors(cudaFreeHost(parameter->getParH(lev)->fluidNodeIndices));
+}
+
+void CudaMemoryManager::cudaAllocFluidNodeIndicesBorder(int lev) {
+    uint mem_size_fluid_nodes_border = sizeof(uint) * parameter->getParH(lev)->numberOffluidNodesBorder;
+    // Host
+    checkCudaErrors(
+        cudaMallocHost((void **)&(parameter->getParH(lev)->fluidNodeIndicesBorder), mem_size_fluid_nodes_border));
+    // Device
+    checkCudaErrors(
+        cudaMalloc((void **)&(parameter->getParD(lev)->fluidNodeIndicesBorder), mem_size_fluid_nodes_border));
+    //////////////////////////////////////////////////////////////////////////
+    setMemsizeGPU((double)mem_size_fluid_nodes_border, false);
+}
+
+void CudaMemoryManager::cudaCopyFluidNodeIndicesBorder(int lev) {
+    uint mem_size_fluid_nodes_border = sizeof(uint) * parameter->getParH(lev)->numberOffluidNodesBorder;
+    checkCudaErrors(cudaMemcpy(parameter->getParD(lev)->fluidNodeIndicesBorder,
+                               parameter->getParH(lev)->fluidNodeIndicesBorder,
+                               mem_size_fluid_nodes_border, cudaMemcpyHostToDevice));
+}
+
+void CudaMemoryManager::cudaFreeFluidNodeIndicesBorder(int lev) {
+    checkCudaErrors(cudaFreeHost(parameter->getParH(lev)->fluidNodeIndicesBorder));
+}
+
 ////////////////////////////////////////////////////////////////////////////////////
 //  ActuatorLine
 ///////////////////////////////////////////////////////////////////////////////
@@ -2792,7 +2963,7 @@ void CudaMemoryManager::cudaFree2ndOrderDerivitivesIsoTest(int lev)
 void CudaMemoryManager::cudaAllocBladeRadii(ActuatorLine* actuatorLine)
 {
     checkCudaErrors( cudaMallocHost((void**) &actuatorLine->bladeRadiiH, sizeof(real)*actuatorLine->getNBladeNodes()) );
-
+    
     checkCudaErrors( cudaMalloc((void**) &actuatorLine->bladeRadiiD, sizeof(real)*actuatorLine->getNBladeNodes()) );
 
     setMemsizeGPU(sizeof(real)*actuatorLine->getNBladeNodes(), false);
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
index 27b16240cb63b4505017a7dc50e3a5fc9b19ce82..2f580435c14807371907bdf3450364419dcf8735 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
@@ -91,22 +91,29 @@ public:
 	//////////////////////////////////////////////////////////////////////////
 	//3D domain decomposition
 	void cudaAllocProcessNeighborX(int lev, unsigned int processNeighbor);
-	void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor);
-	void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor);
+    void cudaCopyProcessNeighborXFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
+                                      int streamIndex);
+    void cudaCopyProcessNeighborXFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
+                                      int streamIndex);
 	void cudaCopyProcessNeighborXIndex(int lev, unsigned int processNeighbor);
 	void cudaFreeProcessNeighborX(int lev, unsigned int processNeighbor);
 	//
 	void cudaAllocProcessNeighborY(int lev, unsigned int processNeighbor);
-	void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor);
-	void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor);
+    void cudaCopyProcessNeighborYFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
+                                      int streamIndex);
+    void cudaCopyProcessNeighborYFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
+                                      int streamIndex);
 	void cudaCopyProcessNeighborYIndex(int lev, unsigned int processNeighbor);
-	void cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor);
+    void cudaFreeProcessNeighborY(int lev, unsigned int processNeighbor);
 	//
 	void cudaAllocProcessNeighborZ(int lev, unsigned int processNeighbor);
-	void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor);
-	void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor);
+    void cudaCopyProcessNeighborZFsHD(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsRecv,
+                                      int streamIndex);
+    void cudaCopyProcessNeighborZFsDH(int lev, unsigned int processNeighbor, const unsigned int &memsizeFsSend,
+                                      int streamIndex);
 	void cudaCopyProcessNeighborZIndex(int lev, unsigned int processNeighbor);
 	void cudaFreeProcessNeighborZ(int lev, unsigned int processNeighbor);
+
 	//////////////////////////////////////////////////////////////////////////
 
 	//////////////////////////////////////////////////////////////////////////
@@ -134,6 +141,11 @@ public:
     void cudaCopyTurbulentViscosityHD(int lev);
     void cudaCopyTurbulentViscosityDH(int lev);
     void cudaFreeTurbulentViscosity(int lev);
+
+    void cudaAllocTurbulenceIntensity(int lev, uint size);
+    void cudaCopyTurbulenceIntensityHD(int lev, uint size);
+    void cudaCopyTurbulenceIntensityDH(int lev, uint size);
+    void cudaFreeTurbulenceIntensity(int lev);
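+    // typical sequence (sketch): cudaAllocTurbulenceIntensity(lev, size),
+    // run the device kernels, cudaCopyTurbulenceIntensityDH(lev, size) to
+    // read back, then cudaFreeTurbulenceIntensity(lev)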
     
     void cudaAllocMedianSP(int lev);
     void cudaCopyMedianSP(int lev);
@@ -151,6 +163,7 @@ public:
     
     void cudaAllocInterfaceFC(int lev);
     void cudaCopyInterfaceFC(int lev);
+    void cudaCheckInterfaceFCBulk(int lev);
     void cudaFreeInterfaceFC(int lev);
     
     void cudaAllocInterfaceOffCF(int lev);
@@ -331,7 +344,15 @@ public:
     void cudaCopyProcessNeighborADZFsDH(int lev, unsigned int processNeighbor);
     void cudaCopyProcessNeighborADZIndex(int lev, unsigned int processNeighbor);
     void cudaFreeProcessNeighborADZ(int lev, unsigned int processNeighbor);
-
+    
+    void cudaAllocFluidNodeIndices(int lev);
+    void cudaCopyFluidNodeIndices(int lev);
+    void cudaFreeFluidNodeIndices(int lev);
+    void cudaAllocFluidNodeIndicesBorder(int lev);
+    void cudaCopyFluidNodeIndicesBorder(int lev);
+    void cudaFreeFluidNodeIndicesBorder(int lev);
+
+    // Actuator Line
     void cudaAllocBladeRadii(ActuatorLine* actuatorLine);
     void cudaCopyBladeRadiiHtoD(ActuatorLine* actuatorLine);
     void cudaCopyBladeRadiiDtoH(ActuatorLine* actuatorLine);
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
index f7b89610d09cec436ebc6cb0e4473dbf6245c847..18e26b87ce7831c6394b852a593b744b29f4afae 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
@@ -8,10 +8,13 @@
 //random numbers
 #include <curand.h>
 #include <curand_kernel.h>
+#include <cuda_runtime.h>
 
 #include <DataTypes.h>
 #include "LBM/LB.h"
 
+#ifndef GPU_INTERFACE_H
+#define GPU_INTERFACE_H
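+
+// launchers below that take an optional cudaStream_t default to
+// CU_STREAM_LEGACY, i.e. the legacy default stream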
 
 //////////////////////////////////////////////////////////////////////////
 //Kernel
@@ -1913,7 +1916,8 @@ extern "C" void ScaleCF_RhoSq_comp_27(  real* DC,
 										unsigned int nxF, 
 										unsigned int nyF,
 										unsigned int numberOfThreads,
-										OffCF offCF);
+										OffCF offCF,
+                                        CUstream_st *stream);
 
 extern "C" void ScaleCF_RhoSq_3rdMom_comp_27( real* DC, 
 											  real* DF, 
@@ -2154,8 +2158,9 @@ extern "C" void ScaleFC_RhoSq_comp_27(  real* DC,
 										unsigned int nyC, 
 										unsigned int nxF, 
 										unsigned int nyF,
-										unsigned int numberOfThreads,
-										OffFC offFC);
+										unsigned int numberOfThreads, 
+	                                    OffFC offFC,
+                                        CUstream_st *stream);
 
 extern "C" void ScaleFC_RhoSq_3rdMom_comp_27( real* DC, 
 											  real* DF, 
@@ -2412,7 +2417,8 @@ extern "C" void GetSendFsPreDev27(real* DD,
 								  unsigned int* neighborZ,
 								  unsigned int size_Mat, 
 								  bool evenOrOdd,
-								  unsigned int numberOfThreads);
+								  unsigned int numberOfThreads, 
+	                              cudaStream_t stream = CU_STREAM_LEGACY);
 
 extern "C" void GetSendFsPostDev27(real* DD,
 								   real* bufferFs,
@@ -2423,7 +2429,8 @@ extern "C" void GetSendFsPostDev27(real* DD,
 								   unsigned int* neighborZ,
 								   unsigned int size_Mat, 
 								   bool evenOrOdd,
-								   unsigned int numberOfThreads);
+								   unsigned int numberOfThreads, 
+	                               cudaStream_t stream = CU_STREAM_LEGACY);
 
 extern "C" void SetRecvFsPreDev27(real* DD,
 								  real* bufferFs,
@@ -2433,8 +2440,8 @@ extern "C" void SetRecvFsPreDev27(real* DD,
 								  unsigned int* neighborY,
 								  unsigned int* neighborZ,
 								  unsigned int size_Mat, 
-								  bool evenOrOdd,
-								  unsigned int numberOfThreads);
+								  bool evenOrOdd, unsigned int numberOfThreads, 
+	                              cudaStream_t stream = CU_STREAM_LEGACY);
 
 extern "C" void SetRecvFsPostDev27(real* DD,
 								   real* bufferFs,
@@ -2445,7 +2452,8 @@ extern "C" void SetRecvFsPostDev27(real* DD,
 								   unsigned int* neighborZ,
 								   unsigned int size_Mat, 
 								   bool evenOrOdd,
-								   unsigned int numberOfThreads);
+								   unsigned int numberOfThreads,
+                                   cudaStream_t stream = CU_STREAM_LEGACY);
 
 extern "C" void getSendGsDevF3(
 	real* G6,
@@ -2584,3 +2592,23 @@ extern "C" void generateRandomValuesDevice(curandState* state,
 										   real* randArray,
 										   unsigned int numberOfThreads);
 
+extern "C" void CalcTurbulenceIntensityDevice(
+   real* vxx,
+   real* vyy,
+   real* vzz,
+   real* vxy,
+   real* vxz,
+   real* vyz,
+   real* vx_mean,
+   real* vy_mean,
+   real* vz_mean,
+   real* DD, 
+   uint *typeOfGridNode, 
+   unsigned int* neighborX,
+   unsigned int* neighborY,
+   unsigned int* neighborZ,
+   unsigned int size_Mat, 
+   bool evenOrOdd,
+   uint numberOfThreads);
+
+#endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
index d7d38baf2bcf6f5d3abe342359b7676f4ad8266b..380f8d3871df7f459a9cc2c885bab0e82c01835d 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
@@ -2482,6 +2482,23 @@ extern "C" __global__ void initRandom(curandState* state);
 extern "C" __global__ void generateRandomValues(curandState* state, 
 												real* randArray);
 
+extern "C" __global__ void CalcTurbulenceIntensity(
+   real* vxx,
+   real* vyy,
+   real* vzz,
+   real* vxy,
+   real* vxz,
+   real* vyz,
+   real* vx_mean,
+   real* vy_mean,
+   real* vz_mean,
+   real* DD, 
+   uint *typeOfGridNode, 
+   unsigned int* neighborX,
+   unsigned int* neighborY,
+   unsigned int* neighborZ,
+   unsigned int size_Mat, 
+   bool evenOrOdd);
 
 #endif
 							 
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
index 4dce487fc98ee077798f7f75bfbf96906e7585b0..627b78814051313a2d463c09153de416cc219363 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
@@ -5312,7 +5312,8 @@ extern "C" void ScaleCF_RhoSq_comp_27(   real* DC,
 										 unsigned int nxF, 
 										 unsigned int nyF,
 										 unsigned int numberOfThreads,
-										 OffCF offCF)
+										 OffCF offCF,
+                                         CUstream_st *stream)
 {
    int Grid = (kCF / numberOfThreads)+1;
    int Grid1, Grid2;
@@ -5329,7 +5330,7 @@ extern "C" void ScaleCF_RhoSq_comp_27(   real* DC,
    dim3 gridINT_CF(Grid1, Grid2);
    dim3 threads(numberOfThreads, 1, 1 );
 
-      scaleCF_RhoSq_comp_27<<< gridINT_CF, threads >>>( DC,  
+      scaleCF_RhoSq_comp_27<<< gridINT_CF, threads, 0, stream >>>( DC,  
 														DF, 
 														neighborCX,
 														neighborCY,
@@ -6382,7 +6383,8 @@ extern "C" void ScaleFC_RhoSq_comp_27(real* DC,
 									  unsigned int nxF, 
 									  unsigned int nyF,
 									  unsigned int numberOfThreads,
-									  OffFC offFC)
+									  OffFC offFC,
+                                      CUstream_st *stream)
 {
    int Grid = (kFC / numberOfThreads)+1;
    int Grid1, Grid2;
@@ -6399,7 +6401,8 @@ extern "C" void ScaleFC_RhoSq_comp_27(real* DC,
    dim3 gridINT_FC(Grid1, Grid2);
    dim3 threads(numberOfThreads, 1, 1 );
 
-      scaleFC_RhoSq_comp_27<<< gridINT_FC, threads >>>(DC, 
+      scaleFC_RhoSq_comp_27<<<gridINT_FC, threads, 0, stream>>>(
+													   DC, 
 													   DF, 
 													   neighborCX,
 													   neighborCY,
@@ -6423,6 +6426,7 @@ extern "C" void ScaleFC_RhoSq_comp_27(real* DC,
 													   offFC);
       getLastCudaError("scaleFC_RhoSq_27 execution failed"); 
 }
+
 //////////////////////////////////////////////////////////////////////////
 extern "C" void ScaleFC_RhoSq_3rdMom_comp_27( real* DC, 
 											  real* DF, 
@@ -6961,7 +6965,8 @@ extern "C" void GetSendFsPreDev27(real* DD,
 								  unsigned int* neighborZ,
 								  unsigned int size_Mat, 
 								  bool evenOrOdd,
-								  unsigned int numberOfThreads)
+								  unsigned int numberOfThreads,
+								  cudaStream_t stream)
 {
 	int Grid = (buffmax / numberOfThreads)+1;
 	int Grid1, Grid2;
@@ -6978,7 +6983,7 @@ extern "C" void GetSendFsPreDev27(real* DD,
 	dim3 grid(Grid1, Grid2);
 	dim3 threads(numberOfThreads, 1, 1 );
 
-	getSendFsPre27<<< grid, threads >>>(DD, 
+	getSendFsPre27<<< grid, threads, 0, stream >>>(DD, 
 										bufferFs, 
 										sendIndex, 
 										buffmax,
@@ -6999,7 +7004,8 @@ extern "C" void GetSendFsPostDev27(real* DD,
 								   unsigned int* neighborZ,
 								   unsigned int size_Mat, 
 								   bool evenOrOdd,
-								   unsigned int numberOfThreads)
+								   unsigned int numberOfThreads, 
+								   cudaStream_t stream)
 {
 	int Grid = (buffmax / numberOfThreads)+1;
 	int Grid1, Grid2;
@@ -7016,7 +7022,7 @@ extern "C" void GetSendFsPostDev27(real* DD,
 	dim3 grid(Grid1, Grid2);
 	dim3 threads(numberOfThreads, 1, 1 );
 
-	getSendFsPost27<<< grid, threads >>>(DD, 
+	getSendFsPost27<<< grid, threads, 0, stream >>>(DD, 
 										 bufferFs, 
 										 sendIndex, 
 										 buffmax,
@@ -7037,7 +7043,8 @@ extern "C" void SetRecvFsPreDev27(real* DD,
 								  unsigned int* neighborZ,
 								  unsigned int size_Mat, 
 								  bool evenOrOdd,
-								  unsigned int numberOfThreads)
+								  unsigned int numberOfThreads, 
+	                              cudaStream_t stream)
 {
 	int Grid = (buffmax / numberOfThreads)+1;
 	int Grid1, Grid2;
@@ -7054,7 +7061,7 @@ extern "C" void SetRecvFsPreDev27(real* DD,
 	dim3 grid(Grid1, Grid2);
 	dim3 threads(numberOfThreads, 1, 1 );
 
-	setRecvFsPre27<<< grid, threads >>>(DD, 
+	setRecvFsPre27<<< grid, threads, 0, stream >>>(DD, 
 										bufferFs, 
 										recvIndex, 
 										buffmax,
@@ -7074,8 +7081,9 @@ extern "C" void SetRecvFsPostDev27(real* DD,
 								   unsigned int* neighborY,
 								   unsigned int* neighborZ,
 								   unsigned int size_Mat, 
-								   bool evenOrOdd,
-								   unsigned int numberOfThreads)
+								   bool evenOrOdd, 
+	                               unsigned int numberOfThreads, 
+	                               cudaStream_t stream)
 {
 	int Grid = (buffmax / numberOfThreads)+1;
 	int Grid1, Grid2;
@@ -7092,7 +7100,7 @@ extern "C" void SetRecvFsPostDev27(real* DD,
 	dim3 grid(Grid1, Grid2);
 	dim3 threads(numberOfThreads, 1, 1 );
 
-	setRecvFsPost27<<< grid, threads >>>(DD, 
+	setRecvFsPost27<<< grid, threads, 0, stream >>>(DD, 
 										 bufferFs, 
 										 recvIndex, 
 										 buffmax,
@@ -7518,6 +7526,61 @@ extern "C" void generateRandomValuesDevice( curandState* state,
    generateRandomValues<<< gridQ, threads >>> (state,randArray);
    getLastCudaError("generateRandomValues execution failed"); 
 }
+//////////////////////////////////////////////////////////////////////////
+extern "C" void CalcTurbulenceIntensityDevice(
+   real* vxx,
+   real* vyy,
+   real* vzz,
+   real* vxy,
+   real* vxz,
+   real* vyz,
+   real* vx_mean,
+   real* vy_mean,
+   real* vz_mean,
+   real* DD, 
+   uint* typeOfGridNode, 
+   unsigned int* neighborX,
+   unsigned int* neighborY,
+   unsigned int* neighborZ,
+   unsigned int size_Mat, 
+   bool evenOrOdd,
+   uint numberOfThreads)
+{
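+   // one thread per grid node; the 1D launch is folded into a 2D grid once it
+   // exceeds 512 blocks in x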
+   int Grid = (size_Mat / numberOfThreads)+1;
+   int Grid1, Grid2;
+   if (Grid>512)
+   {
+      Grid1 = 512;
+      Grid2 = (Grid/Grid1)+1;
+   } 
+   else
+   {
+      Grid1 = 1;
+      Grid2 = Grid;
+   }
+   dim3 gridQ(Grid1, Grid2);
+   dim3 threads(numberOfThreads, 1, 1 );
+
+   CalcTurbulenceIntensity<<<gridQ, threads>>>(
+     vxx,
+     vyy,
+     vzz,
+     vxy,
+     vxz,
+     vyz,
+     vx_mean,
+     vy_mean,
+     vz_mean,
+     DD, 
+     typeOfGridNode, 
+     neighborX,
+     neighborY,
+     neighborZ,
+     size_Mat, 
+     evenOrOdd);
+
+   getLastCudaError("CalcTurbulenceIntensity execution failed"); 
+}
 
 
 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/ScaleFC27.cu b/src/gpu/VirtualFluids_GPU/GPU/ScaleFC27.cu
index 06d5086cc7f57c8245517f922662e8eea4571d05..f61a6f980bd8684665c32d06f53efcb7f9dc0070 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/ScaleFC27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/ScaleFC27.cu
@@ -9586,6 +9586,1475 @@ extern "C" __global__ void scaleFC_RhoSq_3rdMom_comp_27(real* DC,
 
 
 //////////////////////////////////////////////////////////////////////////
+__device__ void scaleFC_RhoSq_comp_27_Calculation(real *DC, real *DF, unsigned int *neighborCX, unsigned int *neighborCY,
+                                                  unsigned int *neighborCZ, unsigned int *neighborFX, unsigned int *neighborFY,
+                                                  unsigned int *neighborFZ, unsigned int size_MatC, unsigned int size_MatF,
+                                                  bool evenOrOdd, unsigned int *posC, unsigned int *posFSWB, unsigned int kFC,
+                                                  real omCoarse, real omFine, real nu, unsigned int nxC, unsigned int nyC,
+                                                  unsigned int nxF, unsigned int nyF, OffFC offFC, const unsigned k)
+{
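+    // one pointer per D3Q27 direction into the fine-grid distributions DF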
+    real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF,
+        *fbnF, *ftsF, *fzeroF, *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
+
+    feF    = &DF[dirE * size_MatF];
+    fwF    = &DF[dirW * size_MatF];
+    fnF    = &DF[dirN * size_MatF];
+    fsF    = &DF[dirS * size_MatF];
+    ftF    = &DF[dirT * size_MatF];
+    fbF    = &DF[dirB * size_MatF];
+    fneF   = &DF[dirNE * size_MatF];
+    fswF   = &DF[dirSW * size_MatF];
+    fseF   = &DF[dirSE * size_MatF];
+    fnwF   = &DF[dirNW * size_MatF];
+    fteF   = &DF[dirTE * size_MatF];
+    fbwF   = &DF[dirBW * size_MatF];
+    fbeF   = &DF[dirBE * size_MatF];
+    ftwF   = &DF[dirTW * size_MatF];
+    ftnF   = &DF[dirTN * size_MatF];
+    fbsF   = &DF[dirBS * size_MatF];
+    fbnF   = &DF[dirBN * size_MatF];
+    ftsF   = &DF[dirTS * size_MatF];
+    fzeroF = &DF[dirZERO * size_MatF];
+    ftneF  = &DF[dirTNE * size_MatF];
+    ftswF  = &DF[dirTSW * size_MatF];
+    ftseF  = &DF[dirTSE * size_MatF];
+    ftnwF  = &DF[dirTNW * size_MatF];
+    fbneF  = &DF[dirBNE * size_MatF];
+    fbswF  = &DF[dirBSW * size_MatF];
+    fbseF  = &DF[dirBSE * size_MatF];
+    fbnwF  = &DF[dirBNW * size_MatF];
+
+    real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC,
+        *fbnC, *ftsC, *fzeroC, *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
+
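+    // the coarse distributions are read depending on timestep parity: for the
+    // other parity each direction is stored under its opposite direction, so
+    // the pointers are swapped accordingly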
+    if (evenOrOdd == true) {
+        feC    = &DC[dirE * size_MatC];
+        fwC    = &DC[dirW * size_MatC];
+        fnC    = &DC[dirN * size_MatC];
+        fsC    = &DC[dirS * size_MatC];
+        ftC    = &DC[dirT * size_MatC];
+        fbC    = &DC[dirB * size_MatC];
+        fneC   = &DC[dirNE * size_MatC];
+        fswC   = &DC[dirSW * size_MatC];
+        fseC   = &DC[dirSE * size_MatC];
+        fnwC   = &DC[dirNW * size_MatC];
+        fteC   = &DC[dirTE * size_MatC];
+        fbwC   = &DC[dirBW * size_MatC];
+        fbeC   = &DC[dirBE * size_MatC];
+        ftwC   = &DC[dirTW * size_MatC];
+        ftnC   = &DC[dirTN * size_MatC];
+        fbsC   = &DC[dirBS * size_MatC];
+        fbnC   = &DC[dirBN * size_MatC];
+        ftsC   = &DC[dirTS * size_MatC];
+        fzeroC = &DC[dirZERO * size_MatC];
+        ftneC  = &DC[dirTNE * size_MatC];
+        ftswC  = &DC[dirTSW * size_MatC];
+        ftseC  = &DC[dirTSE * size_MatC];
+        ftnwC  = &DC[dirTNW * size_MatC];
+        fbneC  = &DC[dirBNE * size_MatC];
+        fbswC  = &DC[dirBSW * size_MatC];
+        fbseC  = &DC[dirBSE * size_MatC];
+        fbnwC  = &DC[dirBNW * size_MatC];
+    } else {
+        fwC    = &DC[dirE * size_MatC];
+        feC    = &DC[dirW * size_MatC];
+        fsC    = &DC[dirN * size_MatC];
+        fnC    = &DC[dirS * size_MatC];
+        fbC    = &DC[dirT * size_MatC];
+        ftC    = &DC[dirB * size_MatC];
+        fswC   = &DC[dirNE * size_MatC];
+        fneC   = &DC[dirSW * size_MatC];
+        fnwC   = &DC[dirSE * size_MatC];
+        fseC   = &DC[dirNW * size_MatC];
+        fbwC   = &DC[dirTE * size_MatC];
+        fteC   = &DC[dirBW * size_MatC];
+        ftwC   = &DC[dirBE * size_MatC];
+        fbeC   = &DC[dirTW * size_MatC];
+        fbsC   = &DC[dirTN * size_MatC];
+        ftnC   = &DC[dirBS * size_MatC];
+        ftsC   = &DC[dirBN * size_MatC];
+        fbnC   = &DC[dirTS * size_MatC];
+        fzeroC = &DC[dirZERO * size_MatC];
+        fbswC  = &DC[dirTNE * size_MatC];
+        fbneC  = &DC[dirTSW * size_MatC];
+        fbnwC  = &DC[dirTSE * size_MatC];
+        fbseC  = &DC[dirTNW * size_MatC];
+        ftswC  = &DC[dirBNE * size_MatC];
+        ftneC  = &DC[dirBSW * size_MatC];
+        ftnwC  = &DC[dirBSE * size_MatC];
+        ftseC  = &DC[dirBNW * size_MatC];
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    real eps_new = c2o1;
+    real omegaS  = omFine;   //-omFine;
+    real o       = omCoarse; //-omCoarse;
+    // real op = one;
+    // real cu_sq;
+
+    real xoff, yoff, zoff;
+    real xoff_sq, yoff_sq, zoff_sq;
+
+    real press; //,drho,vx1,vx2,vx3;
+    real /*press_SWT,*/ drho_SWT, vx1_SWT, vx2_SWT, vx3_SWT;
+    real /*press_NWT,*/ drho_NWT, vx1_NWT, vx2_NWT, vx3_NWT;
+    real /*press_NET,*/ drho_NET, vx1_NET, vx2_NET, vx3_NET;
+    real /*press_SET,*/ drho_SET, vx1_SET, vx2_SET, vx3_SET;
+    real /*press_SWB,*/ drho_SWB, vx1_SWB, vx2_SWB, vx3_SWB;
+    real /*press_NWB,*/ drho_NWB, vx1_NWB, vx2_NWB, vx3_NWB;
+    real /*press_NEB,*/ drho_NEB, vx1_NEB, vx2_NEB, vx3_NEB;
+    real /*press_SEB,*/ drho_SEB, vx1_SEB, vx2_SEB, vx3_SEB;
+    real f_E, f_W, f_N, f_S, f_T, f_B, f_NE, f_SW, f_SE, f_NW, f_TE, f_BW, f_BE, f_TW, f_TN, f_BS, f_BN, f_TS, f_ZERO,
+        f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
+    // real
+    // feq_E,feq_W,feq_N,feq_S,feq_T,feq_B,feq_NE,feq_SW,feq_SE,feq_NW,feq_TE,feq_BW,feq_BE,feq_TW,feq_TN,feq_BS,feq_BN,feq_TS,feq_ZERO,feq_TNE,
+    // feq_TSW, feq_TSE, feq_TNW, feq_BNE, feq_BSW, feq_BSE, feq_BNW;
+    real kxyFromfcNEQ_SWT, kyzFromfcNEQ_SWT, kxzFromfcNEQ_SWT, kxxMyyFromfcNEQ_SWT, kxxMzzFromfcNEQ_SWT;
+    real kxyFromfcNEQ_NWT, kyzFromfcNEQ_NWT, kxzFromfcNEQ_NWT, kxxMyyFromfcNEQ_NWT, kxxMzzFromfcNEQ_NWT;
+    real kxyFromfcNEQ_NET, kyzFromfcNEQ_NET, kxzFromfcNEQ_NET, kxxMyyFromfcNEQ_NET, kxxMzzFromfcNEQ_NET;
+    real kxyFromfcNEQ_SET, kyzFromfcNEQ_SET, kxzFromfcNEQ_SET, kxxMyyFromfcNEQ_SET, kxxMzzFromfcNEQ_SET;
+    real kxyFromfcNEQ_SWB, kyzFromfcNEQ_SWB, kxzFromfcNEQ_SWB, kxxMyyFromfcNEQ_SWB, kxxMzzFromfcNEQ_SWB;
+    real kxyFromfcNEQ_NWB, kyzFromfcNEQ_NWB, kxzFromfcNEQ_NWB, kxxMyyFromfcNEQ_NWB, kxxMzzFromfcNEQ_NWB;
+    real kxyFromfcNEQ_NEB, kyzFromfcNEQ_NEB, kxzFromfcNEQ_NEB, kxxMyyFromfcNEQ_NEB, kxxMzzFromfcNEQ_NEB;
+    real kxyFromfcNEQ_SEB, kyzFromfcNEQ_SEB, kxzFromfcNEQ_SEB, kxxMyyFromfcNEQ_SEB, kxxMzzFromfcNEQ_SEB;
+    real a0, ax, ay, az, axx, ayy, azz, axy, axz, ayz, b0, bx, by, bz, bxx, byy, bzz, bxy, bxz, byz, c0, cx, cy, cz,
+        cxx, cyy, czz, cxy, cxz, cyz /*, axyz, bxyz, cxyz*/;
+    real d0, dx, dy, dz, dxy, dxz, dyz /*, dxyz*/;
+
+    if (k < kFC) {
+        //////////////////////////////////////////////////////////////////////////
+        xoff    = offFC.xOffFC[k];
+        yoff    = offFC.yOffFC[k];
+        zoff    = offFC.zOffFC[k];
+        xoff_sq = xoff * xoff;
+        yoff_sq = yoff * yoff;
+        zoff_sq = zoff * zoff;
+        //////////////////////////////////////////////////////////////////////////
+        // SWB//
+        //////////////////////////////////////////////////////////////////////////
+        // index 0
+        unsigned int k0zero = posFSWB[k];
+        unsigned int k0w    = neighborFX[k0zero];
+        unsigned int k0s    = neighborFY[k0zero];
+        unsigned int k0b    = neighborFZ[k0zero];
+        unsigned int k0sw   = neighborFY[k0w];
+        unsigned int k0bw   = neighborFZ[k0w];
+        unsigned int k0bs   = neighborFZ[k0s];
+        unsigned int k0bsw  = neighborFZ[k0sw];
+        //////////////////////////////////////////////////////////////////////////
+        // index
+        unsigned int kzero = k0zero;
+        unsigned int kw    = k0w;
+        unsigned int ks    = k0s;
+        unsigned int kb    = k0b;
+        unsigned int ksw   = k0sw;
+        unsigned int kbw   = k0bw;
+        unsigned int kbs   = k0bs;
+        unsigned int kbsw  = k0bsw;
+        ////////////////////////////////////////////////////////////////////////////////
+        f_E    = feF[kzero];
+        f_W    = fwF[kw];
+        f_N    = fnF[kzero];
+        f_S    = fsF[ks];
+        f_T    = ftF[kzero];
+        f_B    = fbF[kb];
+        f_NE   = fneF[kzero];
+        f_SW   = fswF[ksw];
+        f_SE   = fseF[ks];
+        f_NW   = fnwF[kw];
+        f_TE   = fteF[kzero];
+        f_BW   = fbwF[kbw];
+        f_BE   = fbeF[kb];
+        f_TW   = ftwF[kw];
+        f_TN   = ftnF[kzero];
+        f_BS   = fbsF[kbs];
+        f_BN   = fbnF[kb];
+        f_TS   = ftsF[ks];
+        f_ZERO = fzeroF[kzero];
+        f_TNE  = ftneF[kzero];
+        f_TSW  = ftswF[ksw];
+        f_TSE  = ftseF[ks];
+        f_TNW  = ftnwF[kw];
+        f_BNE  = fbneF[kb];
+        f_BSW  = fbswF[kbsw];
+        f_BSE  = fbseF[kbs];
+        f_BNW  = fbnwF[kbw];
+
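+        // Zeroth and first moments at this corner: drho is the density deviation,
+        // vx1..vx3 the velocity, normalized with (1 + drho) for the compressible model.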
+        drho_SWB = f_E + f_W + f_N + f_S + f_T + f_B + f_NE + f_SW + f_SE + f_NW + f_TE + f_BW + f_BE + f_TW + f_TN +
+                   f_BS + f_BN + f_TS + f_ZERO + f_TNE + f_TSW + f_TSE + f_TNW + f_BNE + f_BSW + f_BSE + f_BNW;
+        vx1_SWB = (((f_TNE - f_BSW) + (f_TSE - f_BNW) + (f_BNE - f_TSW) + (f_BSE - f_TNW)) +
+                   (((f_NE - f_SW) + (f_TE - f_BW)) + ((f_SE - f_NW) + (f_BE - f_TW))) + (f_E - f_W)) /
+                  (c1o1 + drho_SWB);
+        vx2_SWB = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_BNE - f_TSW) + (f_BNW - f_TSE)) +
+                   (((f_NE - f_SW) + (f_TN - f_BS)) + ((f_BN - f_TS) + (f_NW - f_SE))) + (f_N - f_S)) /
+                  (c1o1 + drho_SWB);
+        vx3_SWB = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_TSE - f_BNW) + (f_TSW - f_BNE)) +
+                   (((f_TE - f_BW) + (f_TN - f_BS)) + ((f_TW - f_BE) + (f_TS - f_BN))) + (f_T - f_B)) /
+                  (c1o1 + drho_SWB);
+
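+        // Second-order non-equilibrium moments at this corner: cross moments (kxy, kyz,
+        // kxz) with prefactor -3*omegaS and normal-stress differences (kxx-kyy, kxx-kzz)
+        // with prefactor -3/2*omegaS; the advective parts (velocity products) are removed.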
+        kxyFromfcNEQ_SWB =
+            -c3o1 * omegaS *
+            ((f_SW + f_BSW + f_TSW - f_NW - f_BNW - f_TNW - f_SE - f_BSE - f_TSE + f_NE + f_BNE + f_TNE) /
+                 (c1o1 + drho_SWB) -
+             ((vx1_SWB * vx2_SWB)));
+        kyzFromfcNEQ_SWB =
+            -c3o1 * omegaS *
+            ((f_BS + f_BSE + f_BSW - f_TS - f_TSE - f_TSW - f_BN - f_BNE - f_BNW + f_TN + f_TNE + f_TNW) /
+                 (c1o1 + drho_SWB) -
+             ((vx2_SWB * vx3_SWB)));
+        kxzFromfcNEQ_SWB =
+            -c3o1 * omegaS *
+            ((f_BW + f_BSW + f_BNW - f_TW - f_TSW - f_TNW - f_BE - f_BSE - f_BNE + f_TE + f_TSE + f_TNE) /
+                 (c1o1 + drho_SWB) -
+             ((vx1_SWB * vx3_SWB)));
+        kxxMyyFromfcNEQ_SWB =
+            -c3o2 * omegaS *
+            ((f_BW + f_W + f_TW - f_BS - f_S - f_TS - f_BN - f_N - f_TN + f_BE + f_E + f_TE) / (c1o1 + drho_SWB) -
+             ((vx1_SWB * vx1_SWB - vx2_SWB * vx2_SWB)));
+        kxxMzzFromfcNEQ_SWB =
+            -c3o2 * omegaS *
+            ((f_SW + f_W + f_NW - f_BS - f_TS - f_B - f_T - f_BN - f_TN + f_SE + f_E + f_NE) / (c1o1 + drho_SWB) -
+             ((vx1_SWB * vx1_SWB - vx3_SWB * vx3_SWB)));
+
+        //////////////////////////////////////////////////////////////////////////
+        // SWT//
+        //////////////////////////////////////////////////////////////////////////
+        // index
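+        // Shift the node indices one step along the z neighbor list to reach the top
+        // corner; the analogous shifts repeat for the remaining corners below.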
+        kzero = kb;
+        kw    = kbw;
+        ks    = kbs;
+        kb    = neighborFZ[kb];
+        ksw   = kbsw;
+        kbw   = neighborFZ[kbw];
+        kbs   = neighborFZ[kbs];
+        kbsw  = neighborFZ[kbsw];
+        ////////////////////////////////////////////////////////////////////////////////
+        f_E    = feF[kzero];
+        f_W    = fwF[kw];
+        f_N    = fnF[kzero];
+        f_S    = fsF[ks];
+        f_T    = ftF[kzero];
+        f_B    = fbF[kb];
+        f_NE   = fneF[kzero];
+        f_SW   = fswF[ksw];
+        f_SE   = fseF[ks];
+        f_NW   = fnwF[kw];
+        f_TE   = fteF[kzero];
+        f_BW   = fbwF[kbw];
+        f_BE   = fbeF[kb];
+        f_TW   = ftwF[kw];
+        f_TN   = ftnF[kzero];
+        f_BS   = fbsF[kbs];
+        f_BN   = fbnF[kb];
+        f_TS   = ftsF[ks];
+        f_ZERO = fzeroF[kzero];
+        f_TNE  = ftneF[kzero];
+        f_TSW  = ftswF[ksw];
+        f_TSE  = ftseF[ks];
+        f_TNW  = ftnwF[kw];
+        f_BNE  = fbneF[kb];
+        f_BSW  = fbswF[kbsw];
+        f_BSE  = fbseF[kbs];
+        f_BNW  = fbnwF[kbw];
+
+        drho_SWT = f_E + f_W + f_N + f_S + f_T + f_B + f_NE + f_SW + f_SE + f_NW + f_TE + f_BW + f_BE + f_TW + f_TN +
+                   f_BS + f_BN + f_TS + f_ZERO + f_TNE + f_TSW + f_TSE + f_TNW + f_BNE + f_BSW + f_BSE + f_BNW;
+        vx1_SWT = (((f_TNE - f_BSW) + (f_TSE - f_BNW) + (f_BNE - f_TSW) + (f_BSE - f_TNW)) +
+                   (((f_NE - f_SW) + (f_TE - f_BW)) + ((f_SE - f_NW) + (f_BE - f_TW))) + (f_E - f_W)) /
+                  (c1o1 + drho_SWT);
+        vx2_SWT = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_BNE - f_TSW) + (f_BNW - f_TSE)) +
+                   (((f_NE - f_SW) + (f_TN - f_BS)) + ((f_BN - f_TS) + (f_NW - f_SE))) + (f_N - f_S)) /
+                  (c1o1 + drho_SWT);
+        vx3_SWT = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_TSE - f_BNW) + (f_TSW - f_BNE)) +
+                   (((f_TE - f_BW) + (f_TN - f_BS)) + ((f_TW - f_BE) + (f_TS - f_BN))) + (f_T - f_B)) /
+                  (c1o1 + drho_SWT);
+
+        kxyFromfcNEQ_SWT =
+            -c3o1 * omegaS *
+            ((f_SW + f_BSW + f_TSW - f_NW - f_BNW - f_TNW - f_SE - f_BSE - f_TSE + f_NE + f_BNE + f_TNE) /
+                 (c1o1 + drho_SWT) -
+             ((vx1_SWT * vx2_SWT)));
+        kyzFromfcNEQ_SWT =
+            -c3o1 * omegaS *
+            ((f_BS + f_BSE + f_BSW - f_TS - f_TSE - f_TSW - f_BN - f_BNE - f_BNW + f_TN + f_TNE + f_TNW) /
+                 (c1o1 + drho_SWT) -
+             ((vx2_SWT * vx3_SWT)));
+        kxzFromfcNEQ_SWT =
+            -c3o1 * omegaS *
+            ((f_BW + f_BSW + f_BNW - f_TW - f_TSW - f_TNW - f_BE - f_BSE - f_BNE + f_TE + f_TSE + f_TNE) /
+                 (c1o1 + drho_SWT) -
+             ((vx1_SWT * vx3_SWT)));
+        kxxMyyFromfcNEQ_SWT =
+            -c3o2 * omegaS *
+            ((f_BW + f_W + f_TW - f_BS - f_S - f_TS - f_BN - f_N - f_TN + f_BE + f_E + f_TE) / (c1o1 + drho_SWT) -
+             ((vx1_SWT * vx1_SWT - vx2_SWT * vx2_SWT)));
+        kxxMzzFromfcNEQ_SWT =
+            -c3o2 * omegaS *
+            ((f_SW + f_W + f_NW - f_BS - f_TS - f_B - f_T - f_BN - f_TN + f_SE + f_E + f_NE) / (c1o1 + drho_SWT) -
+             ((vx1_SWT * vx1_SWT - vx3_SWT * vx3_SWT)));
+
+        //////////////////////////////////////////////////////////////////////////
+        // SET//
+        //////////////////////////////////////////////////////////////////////////
+        // index
+        kzero = kw;
+        kw    = neighborFX[kw];
+        ks    = ksw;
+        kb    = kbw;
+        ksw   = neighborFX[ksw];
+        kbw   = neighborFX[kbw];
+        kbs   = kbsw;
+        kbsw  = neighborFX[kbsw];
+        ////////////////////////////////////////////////////////////////////////////////
+        f_E    = feF[kzero];
+        f_W    = fwF[kw];
+        f_N    = fnF[kzero];
+        f_S    = fsF[ks];
+        f_T    = ftF[kzero];
+        f_B    = fbF[kb];
+        f_NE   = fneF[kzero];
+        f_SW   = fswF[ksw];
+        f_SE   = fseF[ks];
+        f_NW   = fnwF[kw];
+        f_TE   = fteF[kzero];
+        f_BW   = fbwF[kbw];
+        f_BE   = fbeF[kb];
+        f_TW   = ftwF[kw];
+        f_TN   = ftnF[kzero];
+        f_BS   = fbsF[kbs];
+        f_BN   = fbnF[kb];
+        f_TS   = ftsF[ks];
+        f_ZERO = fzeroF[kzero];
+        f_TNE  = ftneF[kzero];
+        f_TSW  = ftswF[ksw];
+        f_TSE  = ftseF[ks];
+        f_TNW  = ftnwF[kw];
+        f_BNE  = fbneF[kb];
+        f_BSW  = fbswF[kbsw];
+        f_BSE  = fbseF[kbs];
+        f_BNW  = fbnwF[kbw];
+
+        drho_SET = f_E + f_W + f_N + f_S + f_T + f_B + f_NE + f_SW + f_SE + f_NW + f_TE + f_BW + f_BE + f_TW + f_TN +
+                   f_BS + f_BN + f_TS + f_ZERO + f_TNE + f_TSW + f_TSE + f_TNW + f_BNE + f_BSW + f_BSE + f_BNW;
+        vx1_SET = (((f_TNE - f_BSW) + (f_TSE - f_BNW) + (f_BNE - f_TSW) + (f_BSE - f_TNW)) +
+                   (((f_NE - f_SW) + (f_TE - f_BW)) + ((f_SE - f_NW) + (f_BE - f_TW))) + (f_E - f_W)) /
+                  (c1o1 + drho_SET);
+        vx2_SET = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_BNE - f_TSW) + (f_BNW - f_TSE)) +
+                   (((f_NE - f_SW) + (f_TN - f_BS)) + ((f_BN - f_TS) + (f_NW - f_SE))) + (f_N - f_S)) /
+                  (c1o1 + drho_SET);
+        vx3_SET = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_TSE - f_BNW) + (f_TSW - f_BNE)) +
+                   (((f_TE - f_BW) + (f_TN - f_BS)) + ((f_TW - f_BE) + (f_TS - f_BN))) + (f_T - f_B)) /
+                  (c1o1 + drho_SET);
+
+        kxyFromfcNEQ_SET =
+            -c3o1 * omegaS *
+            ((f_SW + f_BSW + f_TSW - f_NW - f_BNW - f_TNW - f_SE - f_BSE - f_TSE + f_NE + f_BNE + f_TNE) /
+                 (c1o1 + drho_SET) -
+             ((vx1_SET * vx2_SET)));
+        kyzFromfcNEQ_SET =
+            -c3o1 * omegaS *
+            ((f_BS + f_BSE + f_BSW - f_TS - f_TSE - f_TSW - f_BN - f_BNE - f_BNW + f_TN + f_TNE + f_TNW) /
+                 (c1o1 + drho_SET) -
+             ((vx2_SET * vx3_SET)));
+        kxzFromfcNEQ_SET =
+            -c3o1 * omegaS *
+            ((f_BW + f_BSW + f_BNW - f_TW - f_TSW - f_TNW - f_BE - f_BSE - f_BNE + f_TE + f_TSE + f_TNE) /
+                 (c1o1 + drho_SET) -
+             ((vx1_SET * vx3_SET)));
+        kxxMyyFromfcNEQ_SET =
+            -c3o2 * omegaS *
+            ((f_BW + f_W + f_TW - f_BS - f_S - f_TS - f_BN - f_N - f_TN + f_BE + f_E + f_TE) / (c1o1 + drho_SET) -
+             ((vx1_SET * vx1_SET - vx2_SET * vx2_SET)));
+        kxxMzzFromfcNEQ_SET =
+            -c3o2 * omegaS *
+            ((f_SW + f_W + f_NW - f_BS - f_TS - f_B - f_T - f_BN - f_TN + f_SE + f_E + f_NE) / (c1o1 + drho_SET) -
+             ((vx1_SET * vx1_SET - vx3_SET * vx3_SET)));
+
+        //////////////////////////////////////////////////////////////////////////
+        // SEB//
+        //////////////////////////////////////////////////////////////////////////
+        // index
+        kb    = kzero;
+        kbw   = kw;
+        kbs   = ks;
+        kbsw  = ksw;
+        kzero = k0w;
+        kw    = neighborFX[k0w];
+        ks    = k0sw;
+        ksw   = neighborFX[k0sw];
+        ////////////////////////////////////////////////////////////////////////////////
+        f_E    = feF[kzero];
+        f_W    = fwF[kw];
+        f_N    = fnF[kzero];
+        f_S    = fsF[ks];
+        f_T    = ftF[kzero];
+        f_B    = fbF[kb];
+        f_NE   = fneF[kzero];
+        f_SW   = fswF[ksw];
+        f_SE   = fseF[ks];
+        f_NW   = fnwF[kw];
+        f_TE   = fteF[kzero];
+        f_BW   = fbwF[kbw];
+        f_BE   = fbeF[kb];
+        f_TW   = ftwF[kw];
+        f_TN   = ftnF[kzero];
+        f_BS   = fbsF[kbs];
+        f_BN   = fbnF[kb];
+        f_TS   = ftsF[ks];
+        f_ZERO = fzeroF[kzero];
+        f_TNE  = ftneF[kzero];
+        f_TSW  = ftswF[ksw];
+        f_TSE  = ftseF[ks];
+        f_TNW  = ftnwF[kw];
+        f_BNE  = fbneF[kb];
+        f_BSW  = fbswF[kbsw];
+        f_BSE  = fbseF[kbs];
+        f_BNW  = fbnwF[kbw];
+
+        drho_SEB = f_E + f_W + f_N + f_S + f_T + f_B + f_NE + f_SW + f_SE + f_NW + f_TE + f_BW + f_BE + f_TW + f_TN +
+                   f_BS + f_BN + f_TS + f_ZERO + f_TNE + f_TSW + f_TSE + f_TNW + f_BNE + f_BSW + f_BSE + f_BNW;
+        vx1_SEB = (((f_TNE - f_BSW) + (f_TSE - f_BNW) + (f_BNE - f_TSW) + (f_BSE - f_TNW)) +
+                   (((f_NE - f_SW) + (f_TE - f_BW)) + ((f_SE - f_NW) + (f_BE - f_TW))) + (f_E - f_W)) /
+                  (c1o1 + drho_SEB);
+        vx2_SEB = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_BNE - f_TSW) + (f_BNW - f_TSE)) +
+                   (((f_NE - f_SW) + (f_TN - f_BS)) + ((f_BN - f_TS) + (f_NW - f_SE))) + (f_N - f_S)) /
+                  (c1o1 + drho_SEB);
+        vx3_SEB = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_TSE - f_BNW) + (f_TSW - f_BNE)) +
+                   (((f_TE - f_BW) + (f_TN - f_BS)) + ((f_TW - f_BE) + (f_TS - f_BN))) + (f_T - f_B)) /
+                  (c1o1 + drho_SEB);
+
+        kxyFromfcNEQ_SEB =
+            -c3o1 * omegaS *
+            ((f_SW + f_BSW + f_TSW - f_NW - f_BNW - f_TNW - f_SE - f_BSE - f_TSE + f_NE + f_BNE + f_TNE) /
+                 (c1o1 + drho_SEB) -
+             ((vx1_SEB * vx2_SEB)));
+        kyzFromfcNEQ_SEB =
+            -c3o1 * omegaS *
+            ((f_BS + f_BSE + f_BSW - f_TS - f_TSE - f_TSW - f_BN - f_BNE - f_BNW + f_TN + f_TNE + f_TNW) /
+                 (c1o1 + drho_SEB) -
+             ((vx2_SEB * vx3_SEB)));
+        kxzFromfcNEQ_SEB =
+            -c3o1 * omegaS *
+            ((f_BW + f_BSW + f_BNW - f_TW - f_TSW - f_TNW - f_BE - f_BSE - f_BNE + f_TE + f_TSE + f_TNE) /
+                 (c1o1 + drho_SEB) -
+             ((vx1_SEB * vx3_SEB)));
+        kxxMyyFromfcNEQ_SEB =
+            -c3o2 * omegaS *
+            ((f_BW + f_W + f_TW - f_BS - f_S - f_TS - f_BN - f_N - f_TN + f_BE + f_E + f_TE) / (c1o1 + drho_SEB) -
+             ((vx1_SEB * vx1_SEB - vx2_SEB * vx2_SEB)));
+        kxxMzzFromfcNEQ_SEB =
+            -c3o2 * omegaS *
+            ((f_SW + f_W + f_NW - f_BS - f_TS - f_B - f_T - f_BN - f_TN + f_SE + f_E + f_NE) / (c1o1 + drho_SEB) -
+             ((vx1_SEB * vx1_SEB - vx3_SEB * vx3_SEB)));
+
+        //////////////////////////////////////////////////////////////////////////
+        // NWB//
+        //////////////////////////////////////////////////////////////////////////
+        // index 0
+        k0zero = k0s;
+        k0w    = k0sw;
+        k0s    = neighborFY[k0s];
+        k0b    = k0bs;
+        k0sw   = neighborFY[k0sw];
+        k0bw   = k0bsw;
+        k0bs   = neighborFY[k0bs];
+        k0bsw  = neighborFY[k0bsw];
+        //////////////////////////////////////////////////////////////////////////
+        // index
+        kzero = k0zero;
+        kw    = k0w;
+        ks    = k0s;
+        kb    = k0b;
+        ksw   = k0sw;
+        kbw   = k0bw;
+        kbs   = k0bs;
+        kbsw  = k0bsw;
+        ////////////////////////////////////////////////////////////////////////////////
+        f_E    = feF[kzero];
+        f_W    = fwF[kw];
+        f_N    = fnF[kzero];
+        f_S    = fsF[ks];
+        f_T    = ftF[kzero];
+        f_B    = fbF[kb];
+        f_NE   = fneF[kzero];
+        f_SW   = fswF[ksw];
+        f_SE   = fseF[ks];
+        f_NW   = fnwF[kw];
+        f_TE   = fteF[kzero];
+        f_BW   = fbwF[kbw];
+        f_BE   = fbeF[kb];
+        f_TW   = ftwF[kw];
+        f_TN   = ftnF[kzero];
+        f_BS   = fbsF[kbs];
+        f_BN   = fbnF[kb];
+        f_TS   = ftsF[ks];
+        f_ZERO = fzeroF[kzero];
+        f_TNE  = ftneF[kzero];
+        f_TSW  = ftswF[ksw];
+        f_TSE  = ftseF[ks];
+        f_TNW  = ftnwF[kw];
+        f_BNE  = fbneF[kb];
+        f_BSW  = fbswF[kbsw];
+        f_BSE  = fbseF[kbs];
+        f_BNW  = fbnwF[kbw];
+
+        drho_NWB = f_E + f_W + f_N + f_S + f_T + f_B + f_NE + f_SW + f_SE + f_NW + f_TE + f_BW + f_BE + f_TW + f_TN +
+                   f_BS + f_BN + f_TS + f_ZERO + f_TNE + f_TSW + f_TSE + f_TNW + f_BNE + f_BSW + f_BSE + f_BNW;
+        vx1_NWB = (((f_TNE - f_BSW) + (f_TSE - f_BNW) + (f_BNE - f_TSW) + (f_BSE - f_TNW)) +
+                   (((f_NE - f_SW) + (f_TE - f_BW)) + ((f_SE - f_NW) + (f_BE - f_TW))) + (f_E - f_W)) /
+                  (c1o1 + drho_NWB);
+        vx2_NWB = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_BNE - f_TSW) + (f_BNW - f_TSE)) +
+                   (((f_NE - f_SW) + (f_TN - f_BS)) + ((f_BN - f_TS) + (f_NW - f_SE))) + (f_N - f_S)) /
+                  (c1o1 + drho_NWB);
+        vx3_NWB = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_TSE - f_BNW) + (f_TSW - f_BNE)) +
+                   (((f_TE - f_BW) + (f_TN - f_BS)) + ((f_TW - f_BE) + (f_TS - f_BN))) + (f_T - f_B)) /
+                  (c1o1 + drho_NWB);
+
+        kxyFromfcNEQ_NWB =
+            -c3o1 * omegaS *
+            ((f_SW + f_BSW + f_TSW - f_NW - f_BNW - f_TNW - f_SE - f_BSE - f_TSE + f_NE + f_BNE + f_TNE) /
+                 (c1o1 + drho_NWB) -
+             ((vx1_NWB * vx2_NWB)));
+        kyzFromfcNEQ_NWB =
+            -c3o1 * omegaS *
+            ((f_BS + f_BSE + f_BSW - f_TS - f_TSE - f_TSW - f_BN - f_BNE - f_BNW + f_TN + f_TNE + f_TNW) /
+                 (c1o1 + drho_NWB) -
+             ((vx2_NWB * vx3_NWB)));
+        kxzFromfcNEQ_NWB =
+            -c3o1 * omegaS *
+            ((f_BW + f_BSW + f_BNW - f_TW - f_TSW - f_TNW - f_BE - f_BSE - f_BNE + f_TE + f_TSE + f_TNE) /
+                 (c1o1 + drho_NWB) -
+             ((vx1_NWB * vx3_NWB)));
+        kxxMyyFromfcNEQ_NWB =
+            -c3o2 * omegaS *
+            ((f_BW + f_W + f_TW - f_BS - f_S - f_TS - f_BN - f_N - f_TN + f_BE + f_E + f_TE) / (c1o1 + drho_NWB) -
+             ((vx1_NWB * vx1_NWB - vx2_NWB * vx2_NWB)));
+        kxxMzzFromfcNEQ_NWB =
+            -c3o2 * omegaS *
+            ((f_SW + f_W + f_NW - f_BS - f_TS - f_B - f_T - f_BN - f_TN + f_SE + f_E + f_NE) / (c1o1 + drho_NWB) -
+             ((vx1_NWB * vx1_NWB - vx3_NWB * vx3_NWB)));
+
+        //////////////////////////////////////////////////////////////////////////
+        // NWT//
+        //////////////////////////////////////////////////////////////////////////
+        // index
+        kzero = kb;
+        kw    = kbw;
+        ks    = kbs;
+        kb    = neighborFZ[kb];
+        ksw   = kbsw;
+        kbw   = neighborFZ[kbw];
+        kbs   = neighborFZ[kbs];
+        kbsw  = neighborFZ[kbsw];
+        ////////////////////////////////////////////////////////////////////////////////
+        f_E    = feF[kzero];
+        f_W    = fwF[kw];
+        f_N    = fnF[kzero];
+        f_S    = fsF[ks];
+        f_T    = ftF[kzero];
+        f_B    = fbF[kb];
+        f_NE   = fneF[kzero];
+        f_SW   = fswF[ksw];
+        f_SE   = fseF[ks];
+        f_NW   = fnwF[kw];
+        f_TE   = fteF[kzero];
+        f_BW   = fbwF[kbw];
+        f_BE   = fbeF[kb];
+        f_TW   = ftwF[kw];
+        f_TN   = ftnF[kzero];
+        f_BS   = fbsF[kbs];
+        f_BN   = fbnF[kb];
+        f_TS   = ftsF[ks];
+        f_ZERO = fzeroF[kzero];
+        f_TNE  = ftneF[kzero];
+        f_TSW  = ftswF[ksw];
+        f_TSE  = ftseF[ks];
+        f_TNW  = ftnwF[kw];
+        f_BNE  = fbneF[kb];
+        f_BSW  = fbswF[kbsw];
+        f_BSE  = fbseF[kbs];
+        f_BNW  = fbnwF[kbw];
+
+        drho_NWT = f_E + f_W + f_N + f_S + f_T + f_B + f_NE + f_SW + f_SE + f_NW + f_TE + f_BW + f_BE + f_TW + f_TN +
+                   f_BS + f_BN + f_TS + f_ZERO + f_TNE + f_TSW + f_TSE + f_TNW + f_BNE + f_BSW + f_BSE + f_BNW;
+        vx1_NWT = (((f_TNE - f_BSW) + (f_TSE - f_BNW) + (f_BNE - f_TSW) + (f_BSE - f_TNW)) +
+                   (((f_NE - f_SW) + (f_TE - f_BW)) + ((f_SE - f_NW) + (f_BE - f_TW))) + (f_E - f_W)) /
+                  (c1o1 + drho_NWT);
+        vx2_NWT = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_BNE - f_TSW) + (f_BNW - f_TSE)) +
+                   (((f_NE - f_SW) + (f_TN - f_BS)) + ((f_BN - f_TS) + (f_NW - f_SE))) + (f_N - f_S)) /
+                  (c1o1 + drho_NWT);
+        vx3_NWT = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_TSE - f_BNW) + (f_TSW - f_BNE)) +
+                   (((f_TE - f_BW) + (f_TN - f_BS)) + ((f_TW - f_BE) + (f_TS - f_BN))) + (f_T - f_B)) /
+                  (c1o1 + drho_NWT);
+
+        kxyFromfcNEQ_NWT =
+            -c3o1 * omegaS *
+            ((f_SW + f_BSW + f_TSW - f_NW - f_BNW - f_TNW - f_SE - f_BSE - f_TSE + f_NE + f_BNE + f_TNE) /
+                 (c1o1 + drho_NWT) -
+             ((vx1_NWT * vx2_NWT)));
+        kyzFromfcNEQ_NWT =
+            -c3o1 * omegaS *
+            ((f_BS + f_BSE + f_BSW - f_TS - f_TSE - f_TSW - f_BN - f_BNE - f_BNW + f_TN + f_TNE + f_TNW) /
+                 (c1o1 + drho_NWT) -
+             ((vx2_NWT * vx3_NWT)));
+        kxzFromfcNEQ_NWT =
+            -c3o1 * omegaS *
+            ((f_BW + f_BSW + f_BNW - f_TW - f_TSW - f_TNW - f_BE - f_BSE - f_BNE + f_TE + f_TSE + f_TNE) /
+                 (c1o1 + drho_NWT) -
+             ((vx1_NWT * vx3_NWT)));
+        kxxMyyFromfcNEQ_NWT =
+            -c3o2 * omegaS *
+            ((f_BW + f_W + f_TW - f_BS - f_S - f_TS - f_BN - f_N - f_TN + f_BE + f_E + f_TE) / (c1o1 + drho_NWT) -
+             ((vx1_NWT * vx1_NWT - vx2_NWT * vx2_NWT)));
+        kxxMzzFromfcNEQ_NWT =
+            -c3o2 * omegaS *
+            ((f_SW + f_W + f_NW - f_BS - f_TS - f_B - f_T - f_BN - f_TN + f_SE + f_E + f_NE) / (c1o1 + drho_NWT) -
+             ((vx1_NWT * vx1_NWT - vx3_NWT * vx3_NWT)));
+
+        //////////////////////////////////////////////////////////////////////////
+        // NET//
+        //////////////////////////////////////////////////////////////////////////
+        // index
+        kzero = kw;
+        kw    = neighborFX[kw];
+        ks    = ksw;
+        kb    = kbw;
+        ksw   = neighborFX[ksw];
+        kbw   = neighborFX[kbw];
+        kbs   = kbsw;
+        kbsw  = neighborFX[kbsw];
+        ////////////////////////////////////////////////////////////////////////////////
+        f_E    = feF[kzero];
+        f_W    = fwF[kw];
+        f_N    = fnF[kzero];
+        f_S    = fsF[ks];
+        f_T    = ftF[kzero];
+        f_B    = fbF[kb];
+        f_NE   = fneF[kzero];
+        f_SW   = fswF[ksw];
+        f_SE   = fseF[ks];
+        f_NW   = fnwF[kw];
+        f_TE   = fteF[kzero];
+        f_BW   = fbwF[kbw];
+        f_BE   = fbeF[kb];
+        f_TW   = ftwF[kw];
+        f_TN   = ftnF[kzero];
+        f_BS   = fbsF[kbs];
+        f_BN   = fbnF[kb];
+        f_TS   = ftsF[ks];
+        f_ZERO = fzeroF[kzero];
+        f_TNE  = ftneF[kzero];
+        f_TSW  = ftswF[ksw];
+        f_TSE  = ftseF[ks];
+        f_TNW  = ftnwF[kw];
+        f_BNE  = fbneF[kb];
+        f_BSW  = fbswF[kbsw];
+        f_BSE  = fbseF[kbs];
+        f_BNW  = fbnwF[kbw];
+
+        drho_NET = f_E + f_W + f_N + f_S + f_T + f_B + f_NE + f_SW + f_SE + f_NW + f_TE + f_BW + f_BE + f_TW + f_TN +
+                   f_BS + f_BN + f_TS + f_ZERO + f_TNE + f_TSW + f_TSE + f_TNW + f_BNE + f_BSW + f_BSE + f_BNW;
+        vx1_NET = (((f_TNE - f_BSW) + (f_TSE - f_BNW) + (f_BNE - f_TSW) + (f_BSE - f_TNW)) +
+                   (((f_NE - f_SW) + (f_TE - f_BW)) + ((f_SE - f_NW) + (f_BE - f_TW))) + (f_E - f_W)) /
+                  (c1o1 + drho_NET);
+        vx2_NET = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_BNE - f_TSW) + (f_BNW - f_TSE)) +
+                   (((f_NE - f_SW) + (f_TN - f_BS)) + ((f_BN - f_TS) + (f_NW - f_SE))) + (f_N - f_S)) /
+                  (c1o1 + drho_NET);
+        vx3_NET = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_TSE - f_BNW) + (f_TSW - f_BNE)) +
+                   (((f_TE - f_BW) + (f_TN - f_BS)) + ((f_TW - f_BE) + (f_TS - f_BN))) + (f_T - f_B)) /
+                  (c1o1 + drho_NET);
+
+        kxyFromfcNEQ_NET =
+            -c3o1 * omegaS *
+            ((f_SW + f_BSW + f_TSW - f_NW - f_BNW - f_TNW - f_SE - f_BSE - f_TSE + f_NE + f_BNE + f_TNE) /
+                 (c1o1 + drho_NET) -
+             ((vx1_NET * vx2_NET)));
+        kyzFromfcNEQ_NET =
+            -c3o1 * omegaS *
+            ((f_BS + f_BSE + f_BSW - f_TS - f_TSE - f_TSW - f_BN - f_BNE - f_BNW + f_TN + f_TNE + f_TNW) /
+                 (c1o1 + drho_NET) -
+             ((vx2_NET * vx3_NET)));
+        kxzFromfcNEQ_NET =
+            -c3o1 * omegaS *
+            ((f_BW + f_BSW + f_BNW - f_TW - f_TSW - f_TNW - f_BE - f_BSE - f_BNE + f_TE + f_TSE + f_TNE) /
+                 (c1o1 + drho_NET) -
+             ((vx1_NET * vx3_NET)));
+        kxxMyyFromfcNEQ_NET =
+            -c3o2 * omegaS *
+            ((f_BW + f_W + f_TW - f_BS - f_S - f_TS - f_BN - f_N - f_TN + f_BE + f_E + f_TE) / (c1o1 + drho_NET) -
+             ((vx1_NET * vx1_NET - vx2_NET * vx2_NET)));
+        kxxMzzFromfcNEQ_NET =
+            -c3o2 * omegaS *
+            ((f_SW + f_W + f_NW - f_BS - f_TS - f_B - f_T - f_BN - f_TN + f_SE + f_E + f_NE) / (c1o1 + drho_NET) -
+             ((vx1_NET * vx1_NET - vx3_NET * vx3_NET)));
+
+        //////////////////////////////////////////////////////////////////////////
+        // NEB//
+        //////////////////////////////////////////////////////////////////////////
+        // index
+        kb    = kzero;
+        kbw   = kw;
+        kbs   = ks;
+        kbsw  = ksw;
+        kzero = k0w;
+        kw    = neighborFX[k0w];
+        ks    = k0sw;
+        ksw   = neighborFX[k0sw];
+        ////////////////////////////////////////////////////////////////////////////////
+        f_E    = feF[kzero];
+        f_W    = fwF[kw];
+        f_N    = fnF[kzero];
+        f_S    = fsF[ks];
+        f_T    = ftF[kzero];
+        f_B    = fbF[kb];
+        f_NE   = fneF[kzero];
+        f_SW   = fswF[ksw];
+        f_SE   = fseF[ks];
+        f_NW   = fnwF[kw];
+        f_TE   = fteF[kzero];
+        f_BW   = fbwF[kbw];
+        f_BE   = fbeF[kb];
+        f_TW   = ftwF[kw];
+        f_TN   = ftnF[kzero];
+        f_BS   = fbsF[kbs];
+        f_BN   = fbnF[kb];
+        f_TS   = ftsF[ks];
+        f_ZERO = fzeroF[kzero];
+        f_TNE  = ftneF[kzero];
+        f_TSW  = ftswF[ksw];
+        f_TSE  = ftseF[ks];
+        f_TNW  = ftnwF[kw];
+        f_BNE  = fbneF[kb];
+        f_BSW  = fbswF[kbsw];
+        f_BSE  = fbseF[kbs];
+        f_BNW  = fbnwF[kbw];
+
+        drho_NEB = f_E + f_W + f_N + f_S + f_T + f_B + f_NE + f_SW + f_SE + f_NW + f_TE + f_BW + f_BE + f_TW + f_TN +
+                   f_BS + f_BN + f_TS + f_ZERO + f_TNE + f_TSW + f_TSE + f_TNW + f_BNE + f_BSW + f_BSE + f_BNW;
+        vx1_NEB = (((f_TNE - f_BSW) + (f_TSE - f_BNW) + (f_BNE - f_TSW) + (f_BSE - f_TNW)) +
+                   (((f_NE - f_SW) + (f_TE - f_BW)) + ((f_SE - f_NW) + (f_BE - f_TW))) + (f_E - f_W)) /
+                  (c1o1 + drho_NEB);
+        vx2_NEB = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_BNE - f_TSW) + (f_BNW - f_TSE)) +
+                   (((f_NE - f_SW) + (f_TN - f_BS)) + ((f_BN - f_TS) + (f_NW - f_SE))) + (f_N - f_S)) /
+                  (c1o1 + drho_NEB);
+        vx3_NEB = (((f_TNE - f_BSW) + (f_TNW - f_BSE) + (f_TSE - f_BNW) + (f_TSW - f_BNE)) +
+                   (((f_TE - f_BW) + (f_TN - f_BS)) + ((f_TW - f_BE) + (f_TS - f_BN))) + (f_T - f_B)) /
+                  (c1o1 + drho_NEB);
+
+        kxyFromfcNEQ_NEB =
+            -c3o1 * omegaS *
+            ((f_SW + f_BSW + f_TSW - f_NW - f_BNW - f_TNW - f_SE - f_BSE - f_TSE + f_NE + f_BNE + f_TNE) /
+                 (c1o1 + drho_NEB) -
+             ((vx1_NEB * vx2_NEB)));
+        kyzFromfcNEQ_NEB =
+            -c3o1 * omegaS *
+            ((f_BS + f_BSE + f_BSW - f_TS - f_TSE - f_TSW - f_BN - f_BNE - f_BNW + f_TN + f_TNE + f_TNW) /
+                 (c1o1 + drho_NEB) -
+             ((vx2_NEB * vx3_NEB)));
+        kxzFromfcNEQ_NEB =
+            -c3o1 * omegaS *
+            ((f_BW + f_BSW + f_BNW - f_TW - f_TSW - f_TNW - f_BE - f_BSE - f_BNE + f_TE + f_TSE + f_TNE) /
+                 (c1o1 + drho_NEB) -
+             ((vx1_NEB * vx3_NEB)));
+        kxxMyyFromfcNEQ_NEB =
+            -c3o2 * omegaS *
+            ((f_BW + f_W + f_TW - f_BS - f_S - f_TS - f_BN - f_N - f_TN + f_BE + f_E + f_TE) / (c1o1 + drho_NEB) -
+             ((vx1_NEB * vx1_NEB - vx2_NEB * vx2_NEB)));
+        kxxMzzFromfcNEQ_NEB =
+            -c3o2 * omegaS *
+            ((f_SW + f_W + f_NW - f_BS - f_TS - f_B - f_T - f_BN - f_TN + f_SE + f_E + f_NE) / (c1o1 + drho_NEB) -
+             ((vx1_NEB * vx1_NEB - vx3_NEB * vx3_NEB)));
+
+        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // kxyFromfcNEQ_SWB    = zero;
+        // kyzFromfcNEQ_SWB    = zero;
+        // kxzFromfcNEQ_SWB    = zero;
+        // kxxMyyFromfcNEQ_SWB = zero;
+        // kxxMzzFromfcNEQ_SWB = zero;
+        // kxyFromfcNEQ_SWT    = zero;
+        // kyzFromfcNEQ_SWT    = zero;
+        // kxzFromfcNEQ_SWT    = zero;
+        // kxxMyyFromfcNEQ_SWT = zero;
+        // kxxMzzFromfcNEQ_SWT = zero;
+        // kxyFromfcNEQ_SET    = zero;
+        // kyzFromfcNEQ_SET    = zero;
+        // kxzFromfcNEQ_SET    = zero;
+        // kxxMyyFromfcNEQ_SET = zero;
+        // kxxMzzFromfcNEQ_SET = zero;
+        // kxyFromfcNEQ_SEB    = zero;
+        // kyzFromfcNEQ_SEB    = zero;
+        // kxzFromfcNEQ_SEB    = zero;
+        // kxxMyyFromfcNEQ_SEB = zero;
+        // kxxMzzFromfcNEQ_SEB = zero;
+        // kxyFromfcNEQ_NWB    = zero;
+        // kyzFromfcNEQ_NWB    = zero;
+        // kxzFromfcNEQ_NWB    = zero;
+        // kxxMyyFromfcNEQ_NWB = zero;
+        // kxxMzzFromfcNEQ_NWB = zero;
+        // kxyFromfcNEQ_NWT    = zero;
+        // kyzFromfcNEQ_NWT    = zero;
+        // kxzFromfcNEQ_NWT    = zero;
+        // kxxMyyFromfcNEQ_NWT = zero;
+        // kxxMzzFromfcNEQ_NWT = zero;
+        // kxyFromfcNEQ_NET    = zero;
+        // kyzFromfcNEQ_NET    = zero;
+        // kxzFromfcNEQ_NET    = zero;
+        // kxxMyyFromfcNEQ_NET = zero;
+        // kxxMzzFromfcNEQ_NET = zero;
+        // kxyFromfcNEQ_NEB    = zero;
+        // kyzFromfcNEQ_NEB    = zero;
+        // kxzFromfcNEQ_NEB    = zero;
+        // kxxMyyFromfcNEQ_NEB = zero;
+        // kxxMzzFromfcNEQ_NEB = zero;
+        //////////////////////////////////////////////////////////////////////////
+        // 3
+        //////////////////////////////////////////////////////////////////////////
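+        // Assemble the coefficients of the quadratic interpolation polynomials for the
+        // three velocity components (a* for vx1, b* for vx2, c* for vx3) from the eight
+        // corner velocities and their non-equilibrium moments.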
+        a0 = (-kxxMyyFromfcNEQ_NEB - kxxMyyFromfcNEQ_NET + kxxMyyFromfcNEQ_NWB + kxxMyyFromfcNEQ_NWT -
+              kxxMyyFromfcNEQ_SEB - kxxMyyFromfcNEQ_SET + kxxMyyFromfcNEQ_SWB + kxxMyyFromfcNEQ_SWT -
+              kxxMzzFromfcNEQ_NEB - kxxMzzFromfcNEQ_NET + kxxMzzFromfcNEQ_NWB + kxxMzzFromfcNEQ_NWT -
+              kxxMzzFromfcNEQ_SEB - kxxMzzFromfcNEQ_SET + kxxMzzFromfcNEQ_SWB + kxxMzzFromfcNEQ_SWT -
+              c2o1 * kxyFromfcNEQ_NEB - c2o1 * kxyFromfcNEQ_NET - c2o1 * kxyFromfcNEQ_NWB - c2o1 * kxyFromfcNEQ_NWT +
+              c2o1 * kxyFromfcNEQ_SEB + c2o1 * kxyFromfcNEQ_SET + c2o1 * kxyFromfcNEQ_SWB + c2o1 * kxyFromfcNEQ_SWT +
+              c2o1 * kxzFromfcNEQ_NEB - c2o1 * kxzFromfcNEQ_NET + c2o1 * kxzFromfcNEQ_NWB - c2o1 * kxzFromfcNEQ_NWT +
+              c2o1 * kxzFromfcNEQ_SEB - c2o1 * kxzFromfcNEQ_SET + c2o1 * kxzFromfcNEQ_SWB - c2o1 * kxzFromfcNEQ_SWT +
+              c8o1 * vx1_NEB + c8o1 * vx1_NET + c8o1 * vx1_NWB + c8o1 * vx1_NWT + c8o1 * vx1_SEB + c8o1 * vx1_SET +
+              c8o1 * vx1_SWB + c8o1 * vx1_SWT + c2o1 * vx2_NEB + c2o1 * vx2_NET - c2o1 * vx2_NWB - c2o1 * vx2_NWT -
+              c2o1 * vx2_SEB - c2o1 * vx2_SET + c2o1 * vx2_SWB + c2o1 * vx2_SWT - c2o1 * vx3_NEB + c2o1 * vx3_NET +
+              c2o1 * vx3_NWB - c2o1 * vx3_NWT - c2o1 * vx3_SEB + c2o1 * vx3_SET + c2o1 * vx3_SWB - c2o1 * vx3_SWT) /
+             c64o1;
+        b0 = (c2o1 * kxxMyyFromfcNEQ_NEB + c2o1 * kxxMyyFromfcNEQ_NET + c2o1 * kxxMyyFromfcNEQ_NWB +
+              c2o1 * kxxMyyFromfcNEQ_NWT - c2o1 * kxxMyyFromfcNEQ_SEB - c2o1 * kxxMyyFromfcNEQ_SET -
+              c2o1 * kxxMyyFromfcNEQ_SWB - c2o1 * kxxMyyFromfcNEQ_SWT - kxxMzzFromfcNEQ_NEB - kxxMzzFromfcNEQ_NET -
+              kxxMzzFromfcNEQ_NWB - kxxMzzFromfcNEQ_NWT + kxxMzzFromfcNEQ_SEB + kxxMzzFromfcNEQ_SET +
+              kxxMzzFromfcNEQ_SWB + kxxMzzFromfcNEQ_SWT - c2o1 * kxyFromfcNEQ_NEB - c2o1 * kxyFromfcNEQ_NET +
+              c2o1 * kxyFromfcNEQ_NWB + c2o1 * kxyFromfcNEQ_NWT - c2o1 * kxyFromfcNEQ_SEB - c2o1 * kxyFromfcNEQ_SET +
+              c2o1 * kxyFromfcNEQ_SWB + c2o1 * kxyFromfcNEQ_SWT + c2o1 * kyzFromfcNEQ_NEB - c2o1 * kyzFromfcNEQ_NET +
+              c2o1 * kyzFromfcNEQ_NWB - c2o1 * kyzFromfcNEQ_NWT + c2o1 * kyzFromfcNEQ_SEB - c2o1 * kyzFromfcNEQ_SET +
+              c2o1 * kyzFromfcNEQ_SWB - c2o1 * kyzFromfcNEQ_SWT + c2o1 * vx1_NEB + c2o1 * vx1_NET - c2o1 * vx1_NWB -
+              c2o1 * vx1_NWT - c2o1 * vx1_SEB - c2o1 * vx1_SET + c2o1 * vx1_SWB + c2o1 * vx1_SWT + c8o1 * vx2_NEB +
+              c8o1 * vx2_NET + c8o1 * vx2_NWB + c8o1 * vx2_NWT + c8o1 * vx2_SEB + c8o1 * vx2_SET + c8o1 * vx2_SWB +
+              c8o1 * vx2_SWT - c2o1 * vx3_NEB + c2o1 * vx3_NET - c2o1 * vx3_NWB + c2o1 * vx3_NWT + c2o1 * vx3_SEB -
+              c2o1 * vx3_SET + c2o1 * vx3_SWB - c2o1 * vx3_SWT) /
+             c64o1;
+        c0 = (kxxMyyFromfcNEQ_NEB - kxxMyyFromfcNEQ_NET + kxxMyyFromfcNEQ_NWB - kxxMyyFromfcNEQ_NWT +
+              kxxMyyFromfcNEQ_SEB - kxxMyyFromfcNEQ_SET + kxxMyyFromfcNEQ_SWB - kxxMyyFromfcNEQ_SWT -
+              c2o1 * kxxMzzFromfcNEQ_NEB + c2o1 * kxxMzzFromfcNEQ_NET - c2o1 * kxxMzzFromfcNEQ_NWB +
+              c2o1 * kxxMzzFromfcNEQ_NWT - c2o1 * kxxMzzFromfcNEQ_SEB + c2o1 * kxxMzzFromfcNEQ_SET -
+              c2o1 * kxxMzzFromfcNEQ_SWB + c2o1 * kxxMzzFromfcNEQ_SWT - c2o1 * kxzFromfcNEQ_NEB -
+              c2o1 * kxzFromfcNEQ_NET + c2o1 * kxzFromfcNEQ_NWB + c2o1 * kxzFromfcNEQ_NWT - c2o1 * kxzFromfcNEQ_SEB -
+              c2o1 * kxzFromfcNEQ_SET + c2o1 * kxzFromfcNEQ_SWB + c2o1 * kxzFromfcNEQ_SWT - c2o1 * kyzFromfcNEQ_NEB -
+              c2o1 * kyzFromfcNEQ_NET - c2o1 * kyzFromfcNEQ_NWB - c2o1 * kyzFromfcNEQ_NWT + c2o1 * kyzFromfcNEQ_SEB +
+              c2o1 * kyzFromfcNEQ_SET + c2o1 * kyzFromfcNEQ_SWB + c2o1 * kyzFromfcNEQ_SWT - c2o1 * vx1_NEB +
+              c2o1 * vx1_NET + c2o1 * vx1_NWB - c2o1 * vx1_NWT - c2o1 * vx1_SEB + c2o1 * vx1_SET + c2o1 * vx1_SWB -
+              c2o1 * vx1_SWT - c2o1 * vx2_NEB + c2o1 * vx2_NET - c2o1 * vx2_NWB + c2o1 * vx2_NWT + c2o1 * vx2_SEB -
+              c2o1 * vx2_SET + c2o1 * vx2_SWB - c2o1 * vx2_SWT + c8o1 * vx3_NEB + c8o1 * vx3_NET + c8o1 * vx3_NWB +
+              c8o1 * vx3_NWT + c8o1 * vx3_SEB + c8o1 * vx3_SET + c8o1 * vx3_SWB + c8o1 * vx3_SWT) /
+             c64o1;
+        ax  = (vx1_NEB + vx1_NET - vx1_NWB - vx1_NWT + vx1_SEB + vx1_SET - vx1_SWB - vx1_SWT) / c4o1;
+        bx  = (vx2_NEB + vx2_NET - vx2_NWB - vx2_NWT + vx2_SEB + vx2_SET - vx2_SWB - vx2_SWT) / c4o1;
+        cx  = (vx3_NEB + vx3_NET - vx3_NWB - vx3_NWT + vx3_SEB + vx3_SET - vx3_SWB - vx3_SWT) / c4o1;
+        axx = (kxxMyyFromfcNEQ_NEB + kxxMyyFromfcNEQ_NET - kxxMyyFromfcNEQ_NWB - kxxMyyFromfcNEQ_NWT +
+               kxxMyyFromfcNEQ_SEB + kxxMyyFromfcNEQ_SET - kxxMyyFromfcNEQ_SWB - kxxMyyFromfcNEQ_SWT +
+               kxxMzzFromfcNEQ_NEB + kxxMzzFromfcNEQ_NET - kxxMzzFromfcNEQ_NWB - kxxMzzFromfcNEQ_NWT +
+               kxxMzzFromfcNEQ_SEB + kxxMzzFromfcNEQ_SET - kxxMzzFromfcNEQ_SWB - kxxMzzFromfcNEQ_SWT + c2o1 * vx2_NEB +
+               c2o1 * vx2_NET - c2o1 * vx2_NWB - c2o1 * vx2_NWT - c2o1 * vx2_SEB - c2o1 * vx2_SET + c2o1 * vx2_SWB +
+               c2o1 * vx2_SWT - c2o1 * vx3_NEB + c2o1 * vx3_NET + c2o1 * vx3_NWB - c2o1 * vx3_NWT - c2o1 * vx3_SEB +
+               c2o1 * vx3_SET + c2o1 * vx3_SWB - c2o1 * vx3_SWT) /
+              c16o1;
+        bxx = (kxyFromfcNEQ_NEB + kxyFromfcNEQ_NET - kxyFromfcNEQ_NWB - kxyFromfcNEQ_NWT + kxyFromfcNEQ_SEB +
+               kxyFromfcNEQ_SET - kxyFromfcNEQ_SWB - kxyFromfcNEQ_SWT - c2o1 * vx1_NEB - c2o1 * vx1_NET +
+               c2o1 * vx1_NWB + c2o1 * vx1_NWT + c2o1 * vx1_SEB + c2o1 * vx1_SET - c2o1 * vx1_SWB - c2o1 * vx1_SWT) /
+              c8o1;
+        cxx = (kxzFromfcNEQ_NEB + kxzFromfcNEQ_NET - kxzFromfcNEQ_NWB - kxzFromfcNEQ_NWT + kxzFromfcNEQ_SEB +
+               kxzFromfcNEQ_SET - kxzFromfcNEQ_SWB - kxzFromfcNEQ_SWT + c2o1 * vx1_NEB - c2o1 * vx1_NET -
+               c2o1 * vx1_NWB + c2o1 * vx1_NWT + c2o1 * vx1_SEB - c2o1 * vx1_SET - c2o1 * vx1_SWB + c2o1 * vx1_SWT) /
+              c8o1;
+        ay  = (vx1_NEB + vx1_NET + vx1_NWB + vx1_NWT - vx1_SEB - vx1_SET - vx1_SWB - vx1_SWT) / c4o1;
+        by  = (vx2_NEB + vx2_NET + vx2_NWB + vx2_NWT - vx2_SEB - vx2_SET - vx2_SWB - vx2_SWT) / c4o1;
+        cy  = (vx3_NEB + vx3_NET + vx3_NWB + vx3_NWT - vx3_SEB - vx3_SET - vx3_SWB - vx3_SWT) / c4o1;
+        ayy = (kxyFromfcNEQ_NEB + kxyFromfcNEQ_NET + kxyFromfcNEQ_NWB + kxyFromfcNEQ_NWT - kxyFromfcNEQ_SEB -
+               kxyFromfcNEQ_SET - kxyFromfcNEQ_SWB - kxyFromfcNEQ_SWT - c2o1 * vx2_NEB - c2o1 * vx2_NET +
+               c2o1 * vx2_NWB + c2o1 * vx2_NWT + c2o1 * vx2_SEB + c2o1 * vx2_SET - c2o1 * vx2_SWB - c2o1 * vx2_SWT) /
+              c8o1;
+        byy = (-c2o1 * kxxMyyFromfcNEQ_NEB - c2o1 * kxxMyyFromfcNEQ_NET - c2o1 * kxxMyyFromfcNEQ_NWB -
+               c2o1 * kxxMyyFromfcNEQ_NWT + c2o1 * kxxMyyFromfcNEQ_SEB + c2o1 * kxxMyyFromfcNEQ_SET +
+               c2o1 * kxxMyyFromfcNEQ_SWB + c2o1 * kxxMyyFromfcNEQ_SWT + kxxMzzFromfcNEQ_NEB + kxxMzzFromfcNEQ_NET +
+               kxxMzzFromfcNEQ_NWB + kxxMzzFromfcNEQ_NWT - kxxMzzFromfcNEQ_SEB - kxxMzzFromfcNEQ_SET -
+               kxxMzzFromfcNEQ_SWB - kxxMzzFromfcNEQ_SWT + c2o1 * vx1_NEB + c2o1 * vx1_NET - c2o1 * vx1_NWB -
+               c2o1 * vx1_NWT - c2o1 * vx1_SEB - c2o1 * vx1_SET + c2o1 * vx1_SWB + c2o1 * vx1_SWT - c2o1 * vx3_NEB +
+               c2o1 * vx3_NET - c2o1 * vx3_NWB + c2o1 * vx3_NWT + c2o1 * vx3_SEB - c2o1 * vx3_SET + c2o1 * vx3_SWB -
+               c2o1 * vx3_SWT) /
+              c16o1;
+        cyy = (kyzFromfcNEQ_NEB + kyzFromfcNEQ_NET + kyzFromfcNEQ_NWB + kyzFromfcNEQ_NWT - kyzFromfcNEQ_SEB -
+               kyzFromfcNEQ_SET - kyzFromfcNEQ_SWB - kyzFromfcNEQ_SWT + c2o1 * vx2_NEB - c2o1 * vx2_NET +
+               c2o1 * vx2_NWB - c2o1 * vx2_NWT - c2o1 * vx2_SEB + c2o1 * vx2_SET - c2o1 * vx2_SWB + c2o1 * vx2_SWT) /
+              c8o1;
+        az  = (-vx1_NEB + vx1_NET - vx1_NWB + vx1_NWT - vx1_SEB + vx1_SET - vx1_SWB + vx1_SWT) / c4o1;
+        bz  = (-vx2_NEB + vx2_NET - vx2_NWB + vx2_NWT - vx2_SEB + vx2_SET - vx2_SWB + vx2_SWT) / c4o1;
+        cz  = (-vx3_NEB + vx3_NET - vx3_NWB + vx3_NWT - vx3_SEB + vx3_SET - vx3_SWB + vx3_SWT) / c4o1;
+        azz = (-kxzFromfcNEQ_NEB + kxzFromfcNEQ_NET - kxzFromfcNEQ_NWB + kxzFromfcNEQ_NWT - kxzFromfcNEQ_SEB +
+               kxzFromfcNEQ_SET - kxzFromfcNEQ_SWB + kxzFromfcNEQ_SWT + c2o1 * vx3_NEB - c2o1 * vx3_NET -
+               c2o1 * vx3_NWB + c2o1 * vx3_NWT + c2o1 * vx3_SEB - c2o1 * vx3_SET - c2o1 * vx3_SWB + c2o1 * vx3_SWT) /
+              c8o1;
+        bzz = (-kyzFromfcNEQ_NEB + kyzFromfcNEQ_NET - kyzFromfcNEQ_NWB + kyzFromfcNEQ_NWT - kyzFromfcNEQ_SEB +
+               kyzFromfcNEQ_SET - kyzFromfcNEQ_SWB + kyzFromfcNEQ_SWT + c2o1 * vx3_NEB - c2o1 * vx3_NET +
+               c2o1 * vx3_NWB - c2o1 * vx3_NWT - c2o1 * vx3_SEB + c2o1 * vx3_SET - c2o1 * vx3_SWB + c2o1 * vx3_SWT) /
+              c8o1;
+        czz = (-kxxMyyFromfcNEQ_NEB + kxxMyyFromfcNEQ_NET - kxxMyyFromfcNEQ_NWB + kxxMyyFromfcNEQ_NWT -
+               kxxMyyFromfcNEQ_SEB + kxxMyyFromfcNEQ_SET - kxxMyyFromfcNEQ_SWB + kxxMyyFromfcNEQ_SWT +
+               c2o1 * kxxMzzFromfcNEQ_NEB - c2o1 * kxxMzzFromfcNEQ_NET + c2o1 * kxxMzzFromfcNEQ_NWB -
+               c2o1 * kxxMzzFromfcNEQ_NWT + c2o1 * kxxMzzFromfcNEQ_SEB - c2o1 * kxxMzzFromfcNEQ_SET +
+               c2o1 * kxxMzzFromfcNEQ_SWB - c2o1 * kxxMzzFromfcNEQ_SWT - c2o1 * vx1_NEB + c2o1 * vx1_NET +
+               c2o1 * vx1_NWB - c2o1 * vx1_NWT - c2o1 * vx1_SEB + c2o1 * vx1_SET + c2o1 * vx1_SWB - c2o1 * vx1_SWT -
+               c2o1 * vx2_NEB + c2o1 * vx2_NET - c2o1 * vx2_NWB + c2o1 * vx2_NWT + c2o1 * vx2_SEB - c2o1 * vx2_SET +
+               c2o1 * vx2_SWB - c2o1 * vx2_SWT) /
+              c16o1;
+        axy = (vx1_NEB + vx1_NET - vx1_NWB - vx1_NWT - vx1_SEB - vx1_SET + vx1_SWB + vx1_SWT) / c2o1;
+        bxy = (vx2_NEB + vx2_NET - vx2_NWB - vx2_NWT - vx2_SEB - vx2_SET + vx2_SWB + vx2_SWT) / c2o1;
+        cxy = (vx3_NEB + vx3_NET - vx3_NWB - vx3_NWT - vx3_SEB - vx3_SET + vx3_SWB + vx3_SWT) / c2o1;
+        axz = (-vx1_NEB + vx1_NET + vx1_NWB - vx1_NWT - vx1_SEB + vx1_SET + vx1_SWB - vx1_SWT) / c2o1;
+        bxz = (-vx2_NEB + vx2_NET + vx2_NWB - vx2_NWT - vx2_SEB + vx2_SET + vx2_SWB - vx2_SWT) / c2o1;
+        cxz = (-vx3_NEB + vx3_NET + vx3_NWB - vx3_NWT - vx3_SEB + vx3_SET + vx3_SWB - vx3_SWT) / c2o1;
+        ayz = (-vx1_NEB + vx1_NET - vx1_NWB + vx1_NWT + vx1_SEB - vx1_SET + vx1_SWB - vx1_SWT) / c2o1;
+        byz = (-vx2_NEB + vx2_NET - vx2_NWB + vx2_NWT + vx2_SEB - vx2_SET + vx2_SWB - vx2_SWT) / c2o1;
+        cyz = (-vx3_NEB + vx3_NET - vx3_NWB + vx3_NWT + vx3_SEB - vx3_SET + vx3_SWB - vx3_SWT) / c2o1;
+        // axyz=-vx1_NEB + vx1_NET + vx1_NWB - vx1_NWT + vx1_SEB - vx1_SET - vx1_SWB + vx1_SWT;
+        // bxyz=-vx2_NEB + vx2_NET + vx2_NWB - vx2_NWT + vx2_SEB - vx2_SET - vx2_SWB + vx2_SWT;
+        // cxyz=-vx3_NEB + vx3_NET + vx3_NWB - vx3_NWT + vx3_SEB - vx3_SET - vx3_SWB + vx3_SWT;
+        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        real kxyAverage    = c0o1;
+        real kyzAverage    = c0o1;
+        real kxzAverage    = c0o1;
+        real kxxMyyAverage = c0o1;
+        real kxxMzzAverage = c0o1;
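+        // The averaged moments are switched off (set to c0o1) here; the commented block
+        // below keeps the averaging over the eight corners for reference.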
+        // real kxyAverage    = (kxyFromfcNEQ_SWB + kxyFromfcNEQ_SWT + kxyFromfcNEQ_SET + kxyFromfcNEQ_SEB +
+        //                       kxyFromfcNEQ_NWB + kxyFromfcNEQ_NWT + kxyFromfcNEQ_NET + kxyFromfcNEQ_NEB) * c1o8 - (ay + bx);
+        // real kyzAverage    = (kyzFromfcNEQ_SWB + kyzFromfcNEQ_SWT + kyzFromfcNEQ_SET + kyzFromfcNEQ_SEB +
+        //                       kyzFromfcNEQ_NWB + kyzFromfcNEQ_NWT + kyzFromfcNEQ_NET + kyzFromfcNEQ_NEB) * c1o8 - (bz + cy);
+        // real kxzAverage    = (kxzFromfcNEQ_SWB + kxzFromfcNEQ_SWT + kxzFromfcNEQ_SET + kxzFromfcNEQ_SEB +
+        //                       kxzFromfcNEQ_NWB + kxzFromfcNEQ_NWT + kxzFromfcNEQ_NET + kxzFromfcNEQ_NEB) * c1o8 - (az + cx);
+        // real kxxMyyAverage = (kxxMyyFromfcNEQ_SWB + kxxMyyFromfcNEQ_SWT + kxxMyyFromfcNEQ_SET + kxxMyyFromfcNEQ_SEB +
+        //                       kxxMyyFromfcNEQ_NWB + kxxMyyFromfcNEQ_NWT + kxxMyyFromfcNEQ_NET + kxxMyyFromfcNEQ_NEB) * c1o8 - (ax - by);
+        // real kxxMzzAverage = (kxxMzzFromfcNEQ_SWB + kxxMzzFromfcNEQ_SWT + kxxMzzFromfcNEQ_SET + kxxMzzFromfcNEQ_SEB +
+        //                       kxxMzzFromfcNEQ_NWB + kxxMzzFromfcNEQ_NWT + kxxMzzFromfcNEQ_NET + kxxMzzFromfcNEQ_NEB) * c1o8 - (ax - cz);
+
+        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        ////Press
+        // d0   = ( press_NEB + press_NET + press_NWB + press_NWT + press_SEB + press_SET + press_SWB + press_SWT) * c1o8;
+        // dx   = ( press_NEB + press_NET - press_NWB - press_NWT + press_SEB + press_SET - press_SWB - press_SWT) * c1o4;
+        // dy   = ( press_NEB + press_NET + press_NWB + press_NWT - press_SEB - press_SET - press_SWB - press_SWT) * c1o4;
+        // dz   = (-press_NEB + press_NET - press_NWB + press_NWT - press_SEB + press_SET - press_SWB + press_SWT) * c1o4;
+        // dxy  = ( press_NEB + press_NET - press_NWB - press_NWT - press_SEB - press_SET + press_SWB + press_SWT) * c1o2;
+        // dxz  = (-press_NEB + press_NET + press_NWB - press_NWT - press_SEB + press_SET + press_SWB - press_SWT) * c1o2;
+        // dyz  = (-press_NEB + press_NET - press_NWB + press_NWT + press_SEB - press_SET + press_SWB - press_SWT) * c1o2;
+        // dxyz =  -press_NEB + press_NET + press_NWB - press_NWT + press_SEB - press_SET - press_SWB + press_SWT;
+        //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // drho
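+        // Coefficients of the density polynomial (d*). LapRho estimates the density
+        // Laplacian from products of the velocity gradients and corrects the mean value
+        // d0; it is only applied when the interpolation position has no offset.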
+        real LapRho = ((xoff != c0o1) || (yoff != c0o1) || (zoff != c0o1))
+                          ? c0o1
+                          : -c3o1 * (ax * ax + by * by + cz * cz) - c6o1 * (bx * ay + cx * az + cy * bz);
+        d0 = (drho_NEB + drho_NET + drho_NWB + drho_NWT + drho_SEB + drho_SET + drho_SWB + drho_SWT - c2o1 * LapRho) *
+             c1o8;
+        dx  = (drho_NEB + drho_NET - drho_NWB - drho_NWT + drho_SEB + drho_SET - drho_SWB - drho_SWT) * c1o4;
+        dy  = (drho_NEB + drho_NET + drho_NWB + drho_NWT - drho_SEB - drho_SET - drho_SWB - drho_SWT) * c1o4;
+        dz  = (-drho_NEB + drho_NET - drho_NWB + drho_NWT - drho_SEB + drho_SET - drho_SWB + drho_SWT) * c1o4;
+        dxy = (drho_NEB + drho_NET - drho_NWB - drho_NWT - drho_SEB - drho_SET + drho_SWB + drho_SWT) * c1o2;
+        dxz = (-drho_NEB + drho_NET + drho_NWB - drho_NWT - drho_SEB + drho_SET + drho_SWB - drho_SWT) * c1o2;
+        dyz = (-drho_NEB + drho_NET - drho_NWB + drho_NWT + drho_SEB - drho_SET + drho_SWB - drho_SWT) * c1o2;
+        // dxyz =  -drho_NEB + drho_NET + drho_NWB - drho_NWT + drho_SEB - drho_SET - drho_SWB + drho_SWT;
+        // d0   = zero;
+        // dx   = zero;
+        // dy   = zero;
+        // dz   = zero;
+        // dxy  = zero;
+        // dxz  = zero;
+        // dyz  = zero;
+        // dxyz = zero;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        //
+        // Bernd das Brot
+        //
+        //
+        // x------x
+        // |      |
+        // |   ---+--->X
+        // |      |   \
+        // x------x    \
+        //            off-vector
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
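+        // Re-center the polynomial coefficients at the offset interpolation position
+        // (second-order Taylor shift along the off-vector), so the constant and linear
+        // terms are evaluated at the actual destination node.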
+        a0 = a0 + xoff * ax + yoff * ay + zoff * az + xoff_sq * axx + yoff_sq * ayy + zoff_sq * azz +
+             xoff * yoff * axy + xoff * zoff * axz + yoff * zoff * ayz;
+        ax = ax + c2o1 * xoff * axx + yoff * axy + zoff * axz;
+        ay = ay + c2o1 * yoff * ayy + xoff * axy + zoff * ayz;
+        az = az + c2o1 * zoff * azz + xoff * axz + yoff * ayz;
+        b0 = b0 + xoff * bx + yoff * by + zoff * bz + xoff_sq * bxx + yoff_sq * byy + zoff_sq * bzz +
+             xoff * yoff * bxy + xoff * zoff * bxz + yoff * zoff * byz;
+        bx = bx + c2o1 * xoff * bxx + yoff * bxy + zoff * bxz;
+        by = by + c2o1 * yoff * byy + xoff * bxy + zoff * byz;
+        bz = bz + c2o1 * zoff * bzz + xoff * bxz + yoff * byz;
+        c0 = c0 + xoff * cx + yoff * cy + zoff * cz + xoff_sq * cxx + yoff_sq * cyy + zoff_sq * czz +
+             xoff * yoff * cxy + xoff * zoff * cxz + yoff * zoff * cyz;
+        cx = cx + c2o1 * xoff * cxx + yoff * cxy + zoff * cxz;
+        cy = cy + c2o1 * yoff * cyy + xoff * cxy + zoff * cyz;
+        cz = cz + c2o1 * zoff * czz + xoff * cxz + yoff * cyz;
+        d0 = d0 + xoff * dx + yoff * dy + zoff * dz + xoff * yoff * dxy + xoff * zoff * dxz + yoff * zoff * dyz;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // FIX
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // AAAAAAAAAAAAHHHHHHHHHHHH!!!!! Lousy test!!!
+        // b0= bx= by= bz= bxx= byy= bzz= bxy= bxz= byz= c0= cx= cy= cz= cxx= cyy= czz= cxy= cxz= cyz= axyz= bxyz= cxyz=zero;
+        // b0=zero; bx=zero; by=zero; bz=zero; bxx=zero; byy=zero; bzz=zero; bxy=zero; bxz=zero; byz=zero;
+        // c0=zero;
+        // cx=zero;
+        // cy=zero;
+        // cz=zero;
+        // cxx=zero;
+        // cyy=zero;
+        // czz=zero;
+        // cxy=zero;
+        // cxz=zero;
+        // cyz=zero;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
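+        // Distribution placeholders in the index naming used by these kernels: the three
+        // letters give the discrete velocity in x, y, z (a = -1, b = 0, c = +1), so e.g.
+        // mfcbb is the east direction (+1, 0, 0).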
+        real mfcbb = c0o1;
+        real mfabb = c0o1;
+        real mfbcb = c0o1;
+        real mfbab = c0o1;
+        real mfbbc = c0o1;
+        real mfbba = c0o1;
+        real mfccb = c0o1;
+        real mfaab = c0o1;
+        real mfcab = c0o1;
+        real mfacb = c0o1;
+        real mfcbc = c0o1;
+        real mfaba = c0o1;
+        real mfcba = c0o1;
+        real mfabc = c0o1;
+        real mfbcc = c0o1;
+        real mfbaa = c0o1;
+        real mfbca = c0o1;
+        real mfbac = c0o1;
+        real mfbbb = c0o1;
+        real mfccc = c0o1;
+        real mfaac = c0o1;
+        real mfcac = c0o1;
+        real mfacc = c0o1;
+        real mfcca = c0o1;
+        real mfaaa = c0o1;
+        real mfcaa = c0o1;
+        real mfaca = c0o1;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        real m0, m1, m2, vvx, vvy, vvz, vx2, vy2, vz2, oMdrho;
+        real mxxPyyPzz, mxxMyy, mxxMzz, mxxyPyzz, mxxyMyzz, mxxzPyyz, mxxzMyyz, mxyyPxzz, mxyyMxzz;
+        // real qudricLimit = c1o100; // really bad idea -> this has to be global
+        // real O3 = c2o1 - o;
+        // real residu, residutmp;
+        // residutmp = c0o1;///*-*/ c2o9 * (1./o - c1o2) * eps_new * eps_new;
+        real NeqOn = c1o1; // switch for the non-equilibrium part: c1o1 = on, c0o1 = off
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        //
+        // Position C 0., 0., 0.
+        //
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // x = 0.;
+        // y = 0.;
+        // z = 0.;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // real mxoff = -xoff;
+        // real myoff = -yoff;
+        // real mzoff = -zoff;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        // press = press_NET * (c1o8 - c1o4 * mxoff - c1o4 * myoff - c1o4 * mzoff) +
+        //  press_NWT * (c1o8 + c1o4 * mxoff - c1o4 * myoff - c1o4 * mzoff) +
+        //  press_SET * (c1o8 - c1o4 * mxoff + c1o4 * myoff - c1o4 * mzoff) +
+        //  press_SWT * (c1o8 + c1o4 * mxoff + c1o4 * myoff - c1o4 * mzoff) +
+        //  press_NEB * (c1o8 - c1o4 * mxoff - c1o4 * myoff + c1o4 * mzoff) +
+        //  press_NWB * (c1o8 + c1o4 * mxoff - c1o4 * myoff + c1o4 * mzoff) +
+        //  press_SEB * (c1o8 - c1o4 * mxoff + c1o4 * myoff + c1o4 * mzoff) +
+        //  press_SWB * (c1o8 + c1o4 * mxoff + c1o4 * myoff + c1o4 * mzoff);
+        // drho  = drho_NET * (c1o8 - c1o4 * xoff - c1o4 * yoff - c1o4 * zoff) +
+        //  drho_NWT * (c1o8 + c1o4 * xoff - c1o4 * yoff - c1o4 * zoff) +
+        //  drho_SET * (c1o8 - c1o4 * xoff + c1o4 * yoff - c1o4 * zoff) +
+        //  drho_SWT * (c1o8 + c1o4 * xoff + c1o4 * yoff - c1o4 * zoff) +
+        //  drho_NEB * (c1o8 - c1o4 * xoff - c1o4 * yoff + c1o4 * zoff) +
+        //  drho_NWB * (c1o8 + c1o4 * xoff - c1o4 * yoff + c1o4 * zoff) +
+        //  drho_SEB * (c1o8 - c1o4 * xoff + c1o4 * yoff + c1o4 * zoff) +
+        //  drho_SWB * (c1o8 + c1o4 * xoff + c1o4 * yoff + c1o4 * zoff);
+        press = d0;
+        vvx   = a0;
+        vvy   = b0;
+        vvz   = c0;
+
+        // mfaaa = drho;
+        // mfaaa = press + (ax+by+cz)/three;  //  1/3 = 2/3*(1/op-1/2)
+        mfaaa = press; // if drho is interpolated directly
+
+        vx2    = vvx * vvx;
+        vy2    = vvy * vvy;
+        vz2    = vvz * vvz;
+        oMdrho = c1o1;
+        // oMdrho = one - mfaaa;
+
+        // two
+        // linear combinations
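+        // Reconstruct the second moments from the interpolated gradients: the trace
+        // carries the pressure, the differences and cross moments get the non-equilibrium
+        // contributions scaled with eps_new / o.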
+        mxxPyyPzz = mfaaa;
+        // mxxMyy    = -c2o3*(ax - by)*eps_new/o;
+        // mxxMzz    = -c2o3*(ax - cz)*eps_new/o;
+
+        // mfabb     = -c1o3 * (bz + cy)*eps_new/o;
+        // mfbab     = -c1o3 * (az + cx)*eps_new/o;
+        // mfbba     = -c1o3 * (ay + bx)*eps_new/o;
+        mxxMyy = -c2o3 * ((ax - by) + kxxMyyAverage) * eps_new / o * (c1o1 + press);
+        mxxMzz = -c2o3 * ((ax - cz) + kxxMzzAverage) * eps_new / o * (c1o1 + press);
+
+        mfabb = -c1o3 * ((bz + cy) + kyzAverage) * eps_new / o * (c1o1 + press);
+        mfbab = -c1o3 * ((az + cx) + kxzAverage) * eps_new / o * (c1o1 + press);
+        mfbba = -c1o3 * ((ay + bx) + kxyAverage) * eps_new / o * (c1o1 + press);
+
+        // linear combinations back
+        mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz) * NeqOn;
+        mfaca = c1o3 * (-c2o1 * mxxMyy + mxxMzz + mxxPyyPzz) * NeqOn;
+        mfaac = c1o3 * (mxxMyy - c2o1 * mxxMzz + mxxPyyPzz) * NeqOn;
+
+        // 3.
+        // linear combinations
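+        // Third-order moments are set to zero here; the commented lines keep the
+        // quadric-limiter variant that reconstructs them from the polynomial coefficients.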
+        // residu = residutmp * (ayz + bxz + cxy );
+        // mfbbb = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
+        mfbbb = c0o1;
+
+        // residu = residutmp * (axy + two*bxx + two*bzz + cyz );
+        // residu = -(c1o9*(axy - 2*bxx - 2*bzz + cyz ));
+        // mxxyPyzz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
+        mxxyPyzz = c0o1;
+
+        // residu = residutmp * (axy + two*bxx - two*bzz - cyz );
+        // residu = c1o9*(axy - 2*bxx + 2*bzz - cyz );
+        // mxxyMyzz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
+        mxxyMyzz = c0o1;
+
+        // residu = residutmp * (axz + byz + two*cxx + two*cyy );
+        // residu = -(c1o9*(axz + byz - 2*cxx - 2*cyy ));
+        // mxxzPyyz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
+        mxxzPyyz = c0o1;
+
+        // residu = residutmp * (axz - byz + two*cxx - two*cyy );
+        // residu = c1o9*(axz - byz - 2*cxx + 2*cyy );
+        // mxxzMyyz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
+        mxxzMyyz = c0o1;
+
+        // residu = residutmp * (two*ayy + two*azz + bxy + cxz );
+        // residu = c1o9*(2*ayy + 2*azz - bxy - cxz );
+        // mxyyPxzz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
+        mxyyPxzz = c0o1;
+
+        // residu = residutmp * (two*ayy - two*azz + bxy - cxz );
+        // residu = c1o9*(-2*ayy + 2*azz + bxy - cxz );
+        // mxyyMxzz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
+        mxyyMxzz = c0o1;
+
+        // linear combinations back
+        mfcba = (mxxyMyzz + mxxyPyzz) * c1o2;
+        mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
+        mfcab = (mxxzMyyz + mxxzPyyz) * c1o2;
+        mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
+        mfbca = (mxyyMxzz + mxyyPxzz) * c1o2;
+        mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
+
+        // 4.
+        mfacc = mfaaa * c1o9;
+        mfcac = mfacc;
+        mfcca = mfacc;
+        // 5.
+
+        // 6.
+        mfccc = mfaaa * c1o27;
+        ////////////////////////////////////////////////////////////////////////////////////
+        // back
+        ////////////////////////////////////////////////////////////////////////////////////
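+        // Transform the moments back to distributions by chained one-dimensional back
+        // transformations, one lattice direction at a time (z below, then y, then x);
+        // m0, m1, m2 hold the three updated values of each line of the moment cube.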
+        // conditioning with 1, 0, 1/3, 0, 0, 0, 1/3, 0, 1/9
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Z - Dir
+        m0    = mfaac * c1o2 + mfaab * (vvz - c1o2) + (mfaaa + c1o1 * oMdrho) * (vz2 - vvz) * c1o2;
+        m1    = -mfaac - c2o1 * mfaab * vvz + mfaaa * (c1o1 - vz2) - c1o1 * oMdrho * vz2;
+        m2    = mfaac * c1o2 + mfaab * (vvz + c1o2) + (mfaaa + c1o1 * oMdrho) * (vz2 + vvz) * c1o2;
+        mfaaa = m0;
+        mfaab = m1;
+        mfaac = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfabc * c1o2 + mfabb * (vvz - c1o2) + mfaba * (vz2 - vvz) * c1o2;
+        m1    = -mfabc - c2o1 * mfabb * vvz + mfaba * (c1o1 - vz2);
+        m2    = mfabc * c1o2 + mfabb * (vvz + c1o2) + mfaba * (vz2 + vvz) * c1o2;
+        mfaba = m0;
+        mfabb = m1;
+        mfabc = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfacc * c1o2 + mfacb * (vvz - c1o2) + (mfaca + c1o3 * oMdrho) * (vz2 - vvz) * c1o2;
+        m1    = -mfacc - c2o1 * mfacb * vvz + mfaca * (c1o1 - vz2) - c1o3 * oMdrho * vz2;
+        m2    = mfacc * c1o2 + mfacb * (vvz + c1o2) + (mfaca + c1o3 * oMdrho) * (vz2 + vvz) * c1o2;
+        mfaca = m0;
+        mfacb = m1;
+        mfacc = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfbac * c1o2 + mfbab * (vvz - c1o2) + mfbaa * (vz2 - vvz) * c1o2;
+        m1    = -mfbac - c2o1 * mfbab * vvz + mfbaa * (c1o1 - vz2);
+        m2    = mfbac * c1o2 + mfbab * (vvz + c1o2) + mfbaa * (vz2 + vvz) * c1o2;
+        mfbaa = m0;
+        mfbab = m1;
+        mfbac = m2;
+        /////////b//////////////////////////////////////////////////////////////////////////
+        m0    = mfbbc * c1o2 + mfbbb * (vvz - c1o2) + mfbba * (vz2 - vvz) * c1o2;
+        m1    = -mfbbc - c2o1 * mfbbb * vvz + mfbba * (c1o1 - vz2);
+        m2    = mfbbc * c1o2 + mfbbb * (vvz + c1o2) + mfbba * (vz2 + vvz) * c1o2;
+        mfbba = m0;
+        mfbbb = m1;
+        mfbbc = m2;
+        /////////b//////////////////////////////////////////////////////////////////////////
+        m0    = mfbcc * c1o2 + mfbcb * (vvz - c1o2) + mfbca * (vz2 - vvz) * c1o2;
+        m1    = -mfbcc - c2o1 * mfbcb * vvz + mfbca * (c1o1 - vz2);
+        m2    = mfbcc * c1o2 + mfbcb * (vvz + c1o2) + mfbca * (vz2 + vvz) * c1o2;
+        mfbca = m0;
+        mfbcb = m1;
+        mfbcc = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfcac * c1o2 + mfcab * (vvz - c1o2) + (mfcaa + c1o3 * oMdrho) * (vz2 - vvz) * c1o2;
+        m1    = -mfcac - c2o1 * mfcab * vvz + mfcaa * (c1o1 - vz2) - c1o3 * oMdrho * vz2;
+        m2    = mfcac * c1o2 + mfcab * (vvz + c1o2) + (mfcaa + c1o3 * oMdrho) * (vz2 + vvz) * c1o2;
+        mfcaa = m0;
+        mfcab = m1;
+        mfcac = m2;
+        /////////c//////////////////////////////////////////////////////////////////////////
+        m0    = mfcbc * c1o2 + mfcbb * (vvz - c1o2) + mfcba * (vz2 - vvz) * c1o2;
+        m1    = -mfcbc - c2o1 * mfcbb * vvz + mfcba * (c1o1 - vz2);
+        m2    = mfcbc * c1o2 + mfcbb * (vvz + c1o2) + mfcba * (vz2 + vvz) * c1o2;
+        mfcba = m0;
+        mfcbb = m1;
+        mfcbc = m2;
+        /////////c//////////////////////////////////////////////////////////////////////////
+        m0    = mfccc * c1o2 + mfccb * (vvz - c1o2) + (mfcca + c1o9 * oMdrho) * (vz2 - vvz) * c1o2;
+        m1    = -mfccc - c2o1 * mfccb * vvz + mfcca * (c1o1 - vz2) - c1o9 * oMdrho * vz2;
+        m2    = mfccc * c1o2 + mfccb * (vvz + c1o2) + (mfcca + c1o9 * oMdrho) * (vz2 + vvz) * c1o2;
+        mfcca = m0;
+        mfccb = m1;
+        mfccc = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
+        // conditioning with 1/6, 2/3, 1/6, 0, 0, 0, 1/18, 2/9, 1/18
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Y - Dir
+        m0    = mfaca * c1o2 + mfaba * (vvy - c1o2) + (mfaaa + c1o6 * oMdrho) * (vy2 - vvy) * c1o2;
+        m1    = -mfaca - c2o1 * mfaba * vvy + mfaaa * (c1o1 - vy2) - c1o6 * oMdrho * vy2;
+        m2    = mfaca * c1o2 + mfaba * (vvy + c1o2) + (mfaaa + c1o6 * oMdrho) * (vy2 + vvy) * c1o2;
+        mfaaa = m0;
+        mfaba = m1;
+        mfaca = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfacb * c1o2 + mfabb * (vvy - c1o2) + (mfaab + c2o3 * oMdrho) * (vy2 - vvy) * c1o2;
+        m1    = -mfacb - c2o1 * mfabb * vvy + mfaab * (c1o1 - vy2) - c2o3 * oMdrho * vy2;
+        m2    = mfacb * c1o2 + mfabb * (vvy + c1o2) + (mfaab + c2o3 * oMdrho) * (vy2 + vvy) * c1o2;
+        mfaab = m0;
+        mfabb = m1;
+        mfacb = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfacc * c1o2 + mfabc * (vvy - c1o2) + (mfaac + c1o6 * oMdrho) * (vy2 - vvy) * c1o2;
+        m1    = -mfacc - c2o1 * mfabc * vvy + mfaac * (c1o1 - vy2) - c1o6 * oMdrho * vy2;
+        m2    = mfacc * c1o2 + mfabc * (vvy + c1o2) + (mfaac + c1o6 * oMdrho) * (vy2 + vvy) * c1o2;
+        mfaac = m0;
+        mfabc = m1;
+        mfacc = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfbca * c1o2 + mfbba * (vvy - c1o2) + mfbaa * (vy2 - vvy) * c1o2;
+        m1    = -mfbca - c2o1 * mfbba * vvy + mfbaa * (c1o1 - vy2);
+        m2    = mfbca * c1o2 + mfbba * (vvy + c1o2) + mfbaa * (vy2 + vvy) * c1o2;
+        mfbaa = m0;
+        mfbba = m1;
+        mfbca = m2;
+        /////////b//////////////////////////////////////////////////////////////////////////
+        m0    = mfbcb * c1o2 + mfbbb * (vvy - c1o2) + mfbab * (vy2 - vvy) * c1o2;
+        m1    = -mfbcb - c2o1 * mfbbb * vvy + mfbab * (c1o1 - vy2);
+        m2    = mfbcb * c1o2 + mfbbb * (vvy + c1o2) + mfbab * (vy2 + vvy) * c1o2;
+        mfbab = m0;
+        mfbbb = m1;
+        mfbcb = m2;
+        /////////b//////////////////////////////////////////////////////////////////////////
+        m0    = mfbcc * c1o2 + mfbbc * (vvy - c1o2) + mfbac * (vy2 - vvy) * c1o2;
+        m1    = -mfbcc - c2o1 * mfbbc * vvy + mfbac * (c1o1 - vy2);
+        m2    = mfbcc * c1o2 + mfbbc * (vvy + c1o2) + mfbac * (vy2 + vvy) * c1o2;
+        mfbac = m0;
+        mfbbc = m1;
+        mfbcc = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfcca * c1o2 + mfcba * (vvy - c1o2) + (mfcaa + c1o18 * oMdrho) * (vy2 - vvy) * c1o2;
+        m1    = -mfcca - c2o1 * mfcba * vvy + mfcaa * (c1o1 - vy2) - c1o18 * oMdrho * vy2;
+        m2    = mfcca * c1o2 + mfcba * (vvy + c1o2) + (mfcaa + c1o18 * oMdrho) * (vy2 + vvy) * c1o2;
+        mfcaa = m0;
+        mfcba = m1;
+        mfcca = m2;
+        /////////c//////////////////////////////////////////////////////////////////////////
+        m0    = mfccb * c1o2 + mfcbb * (vvy - c1o2) + (mfcab + c2o9 * oMdrho) * (vy2 - vvy) * c1o2;
+        m1    = -mfccb - c2o1 * mfcbb * vvy + mfcab * (c1o1 - vy2) - c2o9 * oMdrho * vy2;
+        m2    = mfccb * c1o2 + mfcbb * (vvy + c1o2) + (mfcab + c2o9 * oMdrho) * (vy2 + vvy) * c1o2;
+        mfcab = m0;
+        mfcbb = m1;
+        mfccb = m2;
+        /////////c//////////////////////////////////////////////////////////////////////////
+        m0    = mfccc * c1o2 + mfcbc * (vvy - c1o2) + (mfcac + c1o18 * oMdrho) * (vy2 - vvy) * c1o2;
+        m1    = -mfccc - c2o1 * mfcbc * vvy + mfcac * (c1o1 - vy2) - c1o18 * oMdrho * vy2;
+        m2    = mfccc * c1o2 + mfcbc * (vvy + c1o2) + (mfcac + c1o18 * oMdrho) * (vy2 + vvy) * c1o2;
+        mfcac = m0;
+        mfcbc = m1;
+        mfccc = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
+        // conditioning with 1/36, 1/9, 1/36, 1/9, 4/9, 1/9, 1/36, 1/9, 1/36
+        ////////////////////////////////////////////////////////////////////////////////////
+        // X - Dir
+        m0    = mfcaa * c1o2 + mfbaa * (vvx - c1o2) + (mfaaa + c1o36 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfcaa - c2o1 * mfbaa * vvx + mfaaa * (c1o1 - vx2) - c1o36 * oMdrho * vx2;
+        m2    = mfcaa * c1o2 + mfbaa * (vvx + c1o2) + (mfaaa + c1o36 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfaaa = m0;
+        mfbaa = m1;
+        mfcaa = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfcba * c1o2 + mfbba * (vvx - c1o2) + (mfaba + c1o9 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfcba - c2o1 * mfbba * vvx + mfaba * (c1o1 - vx2) - c1o9 * oMdrho * vx2;
+        m2    = mfcba * c1o2 + mfbba * (vvx + c1o2) + (mfaba + c1o9 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfaba = m0;
+        mfbba = m1;
+        mfcba = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfcca * c1o2 + mfbca * (vvx - c1o2) + (mfaca + c1o36 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfcca - c2o1 * mfbca * vvx + mfaca * (c1o1 - vx2) - c1o36 * oMdrho * vx2;
+        m2    = mfcca * c1o2 + mfbca * (vvx + c1o2) + (mfaca + c1o36 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfaca = m0;
+        mfbca = m1;
+        mfcca = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfcab * c1o2 + mfbab * (vvx - c1o2) + (mfaab + c1o9 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfcab - c2o1 * mfbab * vvx + mfaab * (c1o1 - vx2) - c1o9 * oMdrho * vx2;
+        m2    = mfcab * c1o2 + mfbab * (vvx + c1o2) + (mfaab + c1o9 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfaab = m0;
+        mfbab = m1;
+        mfcab = m2;
+        ///////////b////////////////////////////////////////////////////////////////////////
+        m0    = mfcbb * c1o2 + mfbbb * (vvx - c1o2) + (mfabb + c4o9 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfcbb - c2o1 * mfbbb * vvx + mfabb * (c1o1 - vx2) - c4o9 * oMdrho * vx2;
+        m2    = mfcbb * c1o2 + mfbbb * (vvx + c1o2) + (mfabb + c4o9 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfabb = m0;
+        mfbbb = m1;
+        mfcbb = m2;
+        ///////////b////////////////////////////////////////////////////////////////////////
+        m0    = mfccb * c1o2 + mfbcb * (vvx - c1o2) + (mfacb + c1o9 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfccb - c2o1 * mfbcb * vvx + mfacb * (c1o1 - vx2) - c1o9 * oMdrho * vx2;
+        m2    = mfccb * c1o2 + mfbcb * (vvx + c1o2) + (mfacb + c1o9 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfacb = m0;
+        mfbcb = m1;
+        mfccb = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
+        m0    = mfcac * c1o2 + mfbac * (vvx - c1o2) + (mfaac + c1o36 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfcac - c2o1 * mfbac * vvx + mfaac * (c1o1 - vx2) - c1o36 * oMdrho * vx2;
+        m2    = mfcac * c1o2 + mfbac * (vvx + c1o2) + (mfaac + c1o36 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfaac = m0;
+        mfbac = m1;
+        mfcac = m2;
+        ///////////c////////////////////////////////////////////////////////////////////////
+        m0    = mfcbc * c1o2 + mfbbc * (vvx - c1o2) + (mfabc + c1o9 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfcbc - c2o1 * mfbbc * vvx + mfabc * (c1o1 - vx2) - c1o9 * oMdrho * vx2;
+        m2    = mfcbc * c1o2 + mfbbc * (vvx + c1o2) + (mfabc + c1o9 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfabc = m0;
+        mfbbc = m1;
+        mfcbc = m2;
+        ///////////c////////////////////////////////////////////////////////////////////////
+        m0    = mfccc * c1o2 + mfbcc * (vvx - c1o2) + (mfacc + c1o36 * oMdrho) * (vx2 - vvx) * c1o2;
+        m1    = -mfccc - c2o1 * mfbcc * vvx + mfacc * (c1o1 - vx2) - c1o36 * oMdrho * vx2;
+        m2    = mfccc * c1o2 + mfbcc * (vvx + c1o2) + (mfacc + c1o36 * oMdrho) * (vx2 + vvx) * c1o2;
+        mfacc = m0;
+        mfbcc = m1;
+        mfccc = m2;
+        ////////////////////////////////////////////////////////////////////////////////////
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        // index 0
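+        // gather the indices of the eight nodes of the destination coarse cell:
+        // the start node plus its x-, y- and z-neighbors and their combinations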
+        kzero = posC[k];
+        kw    = neighborCX[kzero];
+        ks    = neighborCY[kzero];
+        kb    = neighborCZ[kzero];
+        ksw   = neighborCY[kw];
+        kbw   = neighborCZ[kw];
+        kbs   = neighborCZ[ks];
+        kbsw  = neighborCZ[ksw];
+        ////////////////////////////////////////////////////////////////////////////////////
+
+        ////////////////////////////////////////////////////////////////////////////////////
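+        // scatter the reconstructed distributions into the coarse-grid arrays,
+        // one entry per lattice direction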
+        feC[kzero]    = mfcbb;
+        fwC[kw]       = mfabb;
+        fnC[kzero]    = mfbcb;
+        fsC[ks]       = mfbab;
+        ftC[kzero]    = mfbbc;
+        fbC[kb]       = mfbba;
+        fneC[kzero]   = mfccb;
+        fswC[ksw]     = mfaab;
+        fseC[ks]      = mfcab;
+        fnwC[kw]      = mfacb;
+        fteC[kzero]   = mfcbc;
+        fbwC[kbw]     = mfaba;
+        fbeC[kb]      = mfcba;
+        ftwC[kw]      = mfabc;
+        ftnC[kzero]   = mfbcc;
+        fbsC[kbs]     = mfbaa;
+        fbnC[kb]      = mfbca;
+        ftsC[ks]      = mfbac;
+        fzeroC[kzero] = mfbbb;
+        ftneC[kzero]  = mfccc;
+        ftseC[ks]     = mfcac;
+        fbneC[kb]     = mfcca;
+        fbseC[kbs]    = mfcaa;
+        ftnwC[kw]     = mfacc;
+        ftswC[ksw]    = mfaac;
+        fbnwC[kbw]    = mfaca;
+        fbswC[kbsw]   = mfaaa;
+        ////////////////////////////////////////////////////////////////////////////////////
+    }
+}
+
 extern "C" __global__ void scaleFC_RhoSq_comp_27(real* DC, 
 												 real* DF, 
 												 unsigned int* neighborCX,
@@ -9609,100 +11078,6 @@ extern "C" __global__ void scaleFC_RhoSq_comp_27(real* DC,
 												 unsigned int nyF,
 												 OffFC offFC)
 {
-   real *feF, *fwF, *fnF, *fsF, *ftF, *fbF, *fneF, *fswF, *fseF, *fnwF, *fteF, *fbwF, *fbeF, *ftwF, *ftnF, *fbsF, *fbnF, *ftsF, *fzeroF, 
-      *ftneF, *ftswF, *ftseF, *ftnwF, *fbneF, *fbswF, *fbseF, *fbnwF;
-
-   feF    = &DF[dirE   *size_MatF];
-   fwF    = &DF[dirW   *size_MatF];
-   fnF    = &DF[dirN   *size_MatF];
-   fsF    = &DF[dirS   *size_MatF];
-   ftF    = &DF[dirT   *size_MatF];
-   fbF    = &DF[dirB   *size_MatF];
-   fneF   = &DF[dirNE  *size_MatF];
-   fswF   = &DF[dirSW  *size_MatF];
-   fseF   = &DF[dirSE  *size_MatF];
-   fnwF   = &DF[dirNW  *size_MatF];
-   fteF   = &DF[dirTE  *size_MatF];
-   fbwF   = &DF[dirBW  *size_MatF];
-   fbeF   = &DF[dirBE  *size_MatF];
-   ftwF   = &DF[dirTW  *size_MatF];
-   ftnF   = &DF[dirTN  *size_MatF];
-   fbsF   = &DF[dirBS  *size_MatF];
-   fbnF   = &DF[dirBN  *size_MatF];
-   ftsF   = &DF[dirTS  *size_MatF];
-   fzeroF = &DF[dirZERO*size_MatF];
-   ftneF  = &DF[dirTNE *size_MatF];
-   ftswF  = &DF[dirTSW *size_MatF];
-   ftseF  = &DF[dirTSE *size_MatF];
-   ftnwF  = &DF[dirTNW *size_MatF];
-   fbneF  = &DF[dirBNE *size_MatF];
-   fbswF  = &DF[dirBSW *size_MatF];
-   fbseF  = &DF[dirBSE *size_MatF];
-   fbnwF  = &DF[dirBNW *size_MatF];
-
-   real *feC, *fwC, *fnC, *fsC, *ftC, *fbC, *fneC, *fswC, *fseC, *fnwC, *fteC, *fbwC, *fbeC, *ftwC, *ftnC, *fbsC, *fbnC, *ftsC, *fzeroC,
-      *ftneC, *ftswC, *ftseC, *ftnwC, *fbneC, *fbswC, *fbseC, *fbnwC;
-
-   if (evenOrOdd==true)
-   {
-      feC    = &DC[dirE   *size_MatC];
-      fwC    = &DC[dirW   *size_MatC];
-      fnC    = &DC[dirN   *size_MatC];
-      fsC    = &DC[dirS   *size_MatC];
-      ftC    = &DC[dirT   *size_MatC];
-      fbC    = &DC[dirB   *size_MatC];
-      fneC   = &DC[dirNE  *size_MatC];
-      fswC   = &DC[dirSW  *size_MatC];
-      fseC   = &DC[dirSE  *size_MatC];
-      fnwC   = &DC[dirNW  *size_MatC];
-      fteC   = &DC[dirTE  *size_MatC];
-      fbwC   = &DC[dirBW  *size_MatC];
-      fbeC   = &DC[dirBE  *size_MatC];
-      ftwC   = &DC[dirTW  *size_MatC];
-      ftnC   = &DC[dirTN  *size_MatC];
-      fbsC   = &DC[dirBS  *size_MatC];
-      fbnC   = &DC[dirBN  *size_MatC];
-      ftsC   = &DC[dirTS  *size_MatC];
-      fzeroC = &DC[dirZERO*size_MatC];
-      ftneC  = &DC[dirTNE *size_MatC];
-      ftswC  = &DC[dirTSW *size_MatC];
-      ftseC  = &DC[dirTSE *size_MatC];
-      ftnwC  = &DC[dirTNW *size_MatC];
-      fbneC  = &DC[dirBNE *size_MatC];
-      fbswC  = &DC[dirBSW *size_MatC];
-      fbseC  = &DC[dirBSE *size_MatC];
-      fbnwC  = &DC[dirBNW *size_MatC];
-   } 
-   else
-   {
-      fwC    = &DC[dirE   *size_MatC];
-      feC    = &DC[dirW   *size_MatC];
-      fsC    = &DC[dirN   *size_MatC];
-      fnC    = &DC[dirS   *size_MatC];
-      fbC    = &DC[dirT   *size_MatC];
-      ftC    = &DC[dirB   *size_MatC];
-      fswC   = &DC[dirNE  *size_MatC];
-      fneC   = &DC[dirSW  *size_MatC];
-      fnwC   = &DC[dirSE  *size_MatC];
-      fseC   = &DC[dirNW  *size_MatC];
-      fbwC   = &DC[dirTE  *size_MatC];
-      fteC   = &DC[dirBW  *size_MatC];
-      ftwC   = &DC[dirBE  *size_MatC];
-      fbeC   = &DC[dirTW  *size_MatC];
-      fbsC   = &DC[dirTN  *size_MatC];
-      ftnC   = &DC[dirBS  *size_MatC];
-      ftsC   = &DC[dirBN  *size_MatC];
-      fbnC   = &DC[dirTS  *size_MatC];
-      fzeroC = &DC[dirZERO*size_MatC];
-      fbswC  = &DC[dirTNE *size_MatC];
-      fbneC  = &DC[dirTSW *size_MatC];
-      fbnwC  = &DC[dirTSE *size_MatC];
-      fbseC  = &DC[dirTNW *size_MatC];
-      ftswC  = &DC[dirBNE *size_MatC];
-      ftneC  = &DC[dirBSW *size_MatC];
-      ftnwC  = &DC[dirBSE *size_MatC];
-      ftseC  = &DC[dirBNW *size_MatC];
-   }
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  ix = threadIdx.x;  // Globaler x-Index 
    const unsigned  iy = blockIdx.x;   // Globaler y-Index 
@@ -9714,1177 +11089,10 @@ extern "C" __global__ void scaleFC_RhoSq_comp_27(real* DC,
    const unsigned k = nx*(ny*iz + iy) + ix;
    //////////////////////////////////////////////////////////////////////////
 
-   ////////////////////////////////////////////////////////////////////////////////
-   real eps_new = c2o1;
-   real omegaS = omFine;//-omFine;
-   real o  = omCoarse;//-omCoarse;
-   //real op = one;
-   //real cu_sq;
-
-   real xoff,    yoff,    zoff;
-   real xoff_sq, yoff_sq, zoff_sq;
-
-   real        press;//,drho,vx1,vx2,vx3;
-   real        /*press_SWT,*/drho_SWT,vx1_SWT,vx2_SWT,vx3_SWT;
-   real        /*press_NWT,*/drho_NWT,vx1_NWT,vx2_NWT,vx3_NWT;
-   real        /*press_NET,*/drho_NET,vx1_NET,vx2_NET,vx3_NET;
-   real        /*press_SET,*/drho_SET,vx1_SET,vx2_SET,vx3_SET;
-   real        /*press_SWB,*/drho_SWB,vx1_SWB,vx2_SWB,vx3_SWB;
-   real        /*press_NWB,*/drho_NWB,vx1_NWB,vx2_NWB,vx3_NWB;
-   real        /*press_NEB,*/drho_NEB,vx1_NEB,vx2_NEB,vx3_NEB;
-   real        /*press_SEB,*/drho_SEB,vx1_SEB,vx2_SEB,vx3_SEB;
-   real        f_E,f_W,f_N,f_S,f_T,f_B,f_NE,f_SW,f_SE,f_NW,f_TE,f_BW,f_BE,f_TW,f_TN,f_BS,f_BN,f_TS,f_ZERO,f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
-   //real        feq_E,feq_W,feq_N,feq_S,feq_T,feq_B,feq_NE,feq_SW,feq_SE,feq_NW,feq_TE,feq_BW,feq_BE,feq_TW,feq_TN,feq_BS,feq_BN,feq_TS,feq_ZERO,feq_TNE, feq_TSW, feq_TSE, feq_TNW, feq_BNE, feq_BSW, feq_BSE, feq_BNW;
-   real        kxyFromfcNEQ_SWT, kyzFromfcNEQ_SWT, kxzFromfcNEQ_SWT, kxxMyyFromfcNEQ_SWT, kxxMzzFromfcNEQ_SWT;
-   real        kxyFromfcNEQ_NWT, kyzFromfcNEQ_NWT, kxzFromfcNEQ_NWT, kxxMyyFromfcNEQ_NWT, kxxMzzFromfcNEQ_NWT;
-   real        kxyFromfcNEQ_NET, kyzFromfcNEQ_NET, kxzFromfcNEQ_NET, kxxMyyFromfcNEQ_NET, kxxMzzFromfcNEQ_NET;
-   real        kxyFromfcNEQ_SET, kyzFromfcNEQ_SET, kxzFromfcNEQ_SET, kxxMyyFromfcNEQ_SET, kxxMzzFromfcNEQ_SET;
-   real        kxyFromfcNEQ_SWB, kyzFromfcNEQ_SWB, kxzFromfcNEQ_SWB, kxxMyyFromfcNEQ_SWB, kxxMzzFromfcNEQ_SWB;
-   real        kxyFromfcNEQ_NWB, kyzFromfcNEQ_NWB, kxzFromfcNEQ_NWB, kxxMyyFromfcNEQ_NWB, kxxMzzFromfcNEQ_NWB;
-   real        kxyFromfcNEQ_NEB, kyzFromfcNEQ_NEB, kxzFromfcNEQ_NEB, kxxMyyFromfcNEQ_NEB, kxxMzzFromfcNEQ_NEB;
-   real        kxyFromfcNEQ_SEB, kyzFromfcNEQ_SEB, kxzFromfcNEQ_SEB, kxxMyyFromfcNEQ_SEB, kxxMzzFromfcNEQ_SEB;
-   real        a0, ax, ay, az, axx, ayy, azz, axy, axz, ayz, b0, bx, by, bz, bxx, byy, bzz, bxy, bxz, byz, c0, cx, cy, cz, cxx, cyy, czz, cxy, cxz, cyz/*, axyz, bxyz, cxyz*/;
-   real        d0, dx, dy, dz, dxy, dxz, dyz/*, dxyz*/;
-
-   if(k<kFC)
-   {
-      //////////////////////////////////////////////////////////////////////////
-      xoff = offFC.xOffFC[k];
-      yoff = offFC.yOffFC[k];
-      zoff = offFC.zOffFC[k];      
-      xoff_sq = xoff * xoff;
-      yoff_sq = yoff * yoff;
-      zoff_sq = zoff * zoff;
-      //////////////////////////////////////////////////////////////////////////
-      //SWB//
-      //////////////////////////////////////////////////////////////////////////
-      //index 0
-      unsigned int k0zero= posFSWB[k];
-      unsigned int k0w   = neighborFX[k0zero];
-      unsigned int k0s   = neighborFY[k0zero];
-      unsigned int k0b   = neighborFZ[k0zero];
-      unsigned int k0sw  = neighborFY[k0w];
-      unsigned int k0bw  = neighborFZ[k0w];
-      unsigned int k0bs  = neighborFZ[k0s];
-      unsigned int k0bsw = neighborFZ[k0sw];
-      //////////////////////////////////////////////////////////////////////////
-      //index 
-      unsigned int kzero= k0zero;
-      unsigned int kw   = k0w;   
-      unsigned int ks   = k0s;   
-      unsigned int kb   = k0b;   
-      unsigned int ksw  = k0sw;  
-      unsigned int kbw  = k0bw;  
-      unsigned int kbs  = k0bs;  
-      unsigned int kbsw = k0bsw; 
-      ////////////////////////////////////////////////////////////////////////////////
-      f_E    = feF[kzero];
-      f_W    = fwF[kw];
-      f_N    = fnF[kzero];
-      f_S    = fsF[ks];
-      f_T    = ftF[kzero];
-      f_B    = fbF[kb];
-      f_NE   = fneF[kzero];
-      f_SW   = fswF[ksw];
-      f_SE   = fseF[ks];
-      f_NW   = fnwF[kw];
-      f_TE   = fteF[kzero];
-      f_BW   = fbwF[kbw];
-      f_BE   = fbeF[kb];
-      f_TW   = ftwF[kw];
-      f_TN   = ftnF[kzero];
-      f_BS   = fbsF[kbs];
-      f_BN   = fbnF[kb];
-      f_TS   = ftsF[ks];
-      f_ZERO = fzeroF[kzero];
-      f_TNE  = ftneF[kzero];
-      f_TSW  = ftswF[ksw];
-      f_TSE  = ftseF[ks];
-      f_TNW  = ftnwF[kw];
-      f_BNE  = fbneF[kb];
-      f_BSW  = fbswF[kbsw];
-      f_BSE  = fbseF[kbs];
-      f_BNW  = fbnwF[kbw];
-
-      drho_SWB = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-      vx1_SWB  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(c1o1 + drho_SWB);
-	  vx2_SWB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(c1o1 + drho_SWB);
-	  vx3_SWB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(c1o1 + drho_SWB);
-
-      kxyFromfcNEQ_SWB    = -c3o1*omegaS*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (c1o1 + drho_SWB) - ((vx1_SWB*vx2_SWB)));
-      kyzFromfcNEQ_SWB    = -c3o1*omegaS*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (c1o1 + drho_SWB) - ((vx2_SWB*vx3_SWB)));
-      kxzFromfcNEQ_SWB    = -c3o1*omegaS*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (c1o1 + drho_SWB) - ((vx1_SWB*vx3_SWB)));
-      kxxMyyFromfcNEQ_SWB = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_SWB) - ((vx1_SWB*vx1_SWB-vx2_SWB*vx2_SWB)));
-      kxxMzzFromfcNEQ_SWB = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_SWB) - ((vx1_SWB*vx1_SWB-vx3_SWB*vx3_SWB)));
-
-	  
-      //////////////////////////////////////////////////////////////////////////
-      //SWT//
-      //////////////////////////////////////////////////////////////////////////
-      //index 
-      kzero= kb;
-      kw   = kbw;   
-      ks   = kbs;   
-      kb   = neighborFZ[kb];   
-      ksw  = kbsw;  
-      kbw  = neighborFZ[kbw];  
-      kbs  = neighborFZ[kbs];  
-      kbsw = neighborFZ[kbsw]; 
-      ////////////////////////////////////////////////////////////////////////////////
-      f_E    = feF[kzero];
-      f_W    = fwF[kw];
-      f_N    = fnF[kzero];
-      f_S    = fsF[ks];
-      f_T    = ftF[kzero];
-      f_B    = fbF[kb];
-      f_NE   = fneF[kzero];
-      f_SW   = fswF[ksw];
-      f_SE   = fseF[ks];
-      f_NW   = fnwF[kw];
-      f_TE   = fteF[kzero];
-      f_BW   = fbwF[kbw];
-      f_BE   = fbeF[kb];
-      f_TW   = ftwF[kw];
-      f_TN   = ftnF[kzero];
-      f_BS   = fbsF[kbs];
-      f_BN   = fbnF[kb];
-      f_TS   = ftsF[ks];
-      f_ZERO = fzeroF[kzero];
-      f_TNE  = ftneF[kzero];
-      f_TSW  = ftswF[ksw];
-      f_TSE  = ftseF[ks];
-      f_TNW  = ftnwF[kw];
-      f_BNE  = fbneF[kb];
-      f_BSW  = fbswF[kbsw];
-      f_BSE  = fbseF[kbs];
-      f_BNW  = fbnwF[kbw];
-
-      drho_SWT = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-      vx1_SWT  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(c1o1 + drho_SWT);
-	  vx2_SWT  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(c1o1 + drho_SWT);
-	  vx3_SWT  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(c1o1 + drho_SWT);
-
-      kxyFromfcNEQ_SWT    = -c3o1*omegaS*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (c1o1 + drho_SWT) - ((vx1_SWT*vx2_SWT)));
-      kyzFromfcNEQ_SWT    = -c3o1*omegaS*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (c1o1 + drho_SWT) - ((vx2_SWT*vx3_SWT)));
-      kxzFromfcNEQ_SWT    = -c3o1*omegaS*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (c1o1 + drho_SWT) - ((vx1_SWT*vx3_SWT)));
-      kxxMyyFromfcNEQ_SWT = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_SWT) - ((vx1_SWT*vx1_SWT-vx2_SWT*vx2_SWT)));
-      kxxMzzFromfcNEQ_SWT = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_SWT) - ((vx1_SWT*vx1_SWT-vx3_SWT*vx3_SWT)));
-
-      //////////////////////////////////////////////////////////////////////////
-      //SET//
-      //////////////////////////////////////////////////////////////////////////
-      //index 
-      kzero= kw;
-      kw   = neighborFX[kw];   
-      ks   = ksw;   
-      kb   = kbw;   
-      ksw  = neighborFX[ksw];  
-      kbw  = neighborFX[kbw];  
-      kbs  = kbsw;  
-      kbsw = neighborFX[kbsw]; 
-      ////////////////////////////////////////////////////////////////////////////////
-      f_E    = feF[kzero];
-      f_W    = fwF[kw];
-      f_N    = fnF[kzero];
-      f_S    = fsF[ks];
-      f_T    = ftF[kzero];
-      f_B    = fbF[kb];
-      f_NE   = fneF[kzero];
-      f_SW   = fswF[ksw];
-      f_SE   = fseF[ks];
-      f_NW   = fnwF[kw];
-      f_TE   = fteF[kzero];
-      f_BW   = fbwF[kbw];
-      f_BE   = fbeF[kb];
-      f_TW   = ftwF[kw];
-      f_TN   = ftnF[kzero];
-      f_BS   = fbsF[kbs];
-      f_BN   = fbnF[kb];
-      f_TS   = ftsF[ks];
-      f_ZERO = fzeroF[kzero];
-      f_TNE  = ftneF[kzero];
-      f_TSW  = ftswF[ksw];
-      f_TSE  = ftseF[ks];
-      f_TNW  = ftnwF[kw];
-      f_BNE  = fbneF[kb];
-      f_BSW  = fbswF[kbsw];
-      f_BSE  = fbseF[kbs];
-      f_BNW  = fbnwF[kbw];
-
-      drho_SET = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-      vx1_SET  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(c1o1 + drho_SET);
-	  vx2_SET  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(c1o1 + drho_SET);
-	  vx3_SET  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(c1o1 + drho_SET);
-
-      kxyFromfcNEQ_SET    = -c3o1*omegaS*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (c1o1 + drho_SET) - ((vx1_SET*vx2_SET)));
-      kyzFromfcNEQ_SET    = -c3o1*omegaS*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (c1o1 + drho_SET) - ((vx2_SET*vx3_SET)));
-      kxzFromfcNEQ_SET    = -c3o1*omegaS*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (c1o1 + drho_SET) - ((vx1_SET*vx3_SET)));
-      kxxMyyFromfcNEQ_SET = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_SET) - ((vx1_SET*vx1_SET-vx2_SET*vx2_SET)));
-      kxxMzzFromfcNEQ_SET = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_SET) - ((vx1_SET*vx1_SET-vx3_SET*vx3_SET)));
-
-      //////////////////////////////////////////////////////////////////////////
-      //SEB//
-      //////////////////////////////////////////////////////////////////////////
-      //index 
-      kb   = kzero;   
-      kbw  = kw;  
-      kbs  = ks;  
-      kbsw = ksw; 
-      kzero= k0w;
-      kw   = neighborFX[k0w];   
-      ks   = k0sw;   
-      ksw  = neighborFX[k0sw];  
-      ////////////////////////////////////////////////////////////////////////////////
-      f_E    = feF[kzero];
-      f_W    = fwF[kw];
-      f_N    = fnF[kzero];
-      f_S    = fsF[ks];
-      f_T    = ftF[kzero];
-      f_B    = fbF[kb];
-      f_NE   = fneF[kzero];
-      f_SW   = fswF[ksw];
-      f_SE   = fseF[ks];
-      f_NW   = fnwF[kw];
-      f_TE   = fteF[kzero];
-      f_BW   = fbwF[kbw];
-      f_BE   = fbeF[kb];
-      f_TW   = ftwF[kw];
-      f_TN   = ftnF[kzero];
-      f_BS   = fbsF[kbs];
-      f_BN   = fbnF[kb];
-      f_TS   = ftsF[ks];
-      f_ZERO = fzeroF[kzero];
-      f_TNE  = ftneF[kzero];
-      f_TSW  = ftswF[ksw];
-      f_TSE  = ftseF[ks];
-      f_TNW  = ftnwF[kw];
-      f_BNE  = fbneF[kb];
-      f_BSW  = fbswF[kbsw];
-      f_BSE  = fbseF[kbs];
-      f_BNW  = fbnwF[kbw];
-
-      drho_SEB = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-      vx1_SEB  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(c1o1 + drho_SEB);
-	  vx2_SEB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(c1o1 + drho_SEB);
-	  vx3_SEB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(c1o1 + drho_SEB);
-
-      kxyFromfcNEQ_SEB    = -c3o1*omegaS*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (c1o1 + drho_SEB) - ((vx1_SEB*vx2_SEB)));
-      kyzFromfcNEQ_SEB    = -c3o1*omegaS*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (c1o1 + drho_SEB) - ((vx2_SEB*vx3_SEB)));
-      kxzFromfcNEQ_SEB    = -c3o1*omegaS*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (c1o1 + drho_SEB) - ((vx1_SEB*vx3_SEB)));
-      kxxMyyFromfcNEQ_SEB = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_SEB) - ((vx1_SEB*vx1_SEB-vx2_SEB*vx2_SEB)));
-      kxxMzzFromfcNEQ_SEB = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_SEB) - ((vx1_SEB*vx1_SEB-vx3_SEB*vx3_SEB)));
-
-      //////////////////////////////////////////////////////////////////////////
-      //NWB//
-      //////////////////////////////////////////////////////////////////////////
-      //index 0
-      k0zero= k0s;
-      k0w   = k0sw;
-      k0s   = neighborFY[k0s];
-      k0b   = k0bs;
-      k0sw  = neighborFY[k0sw];
-      k0bw  = k0bsw;
-      k0bs  = neighborFY[k0bs];
-      k0bsw = neighborFY[k0bsw];
-      //////////////////////////////////////////////////////////////////////////
-      //index 
-      kzero= k0zero;
-      kw   = k0w;   
-      ks   = k0s;   
-      kb   = k0b;   
-      ksw  = k0sw;  
-      kbw  = k0bw;  
-      kbs  = k0bs;  
-      kbsw = k0bsw; 
-      ////////////////////////////////////////////////////////////////////////////////
-      f_E    = feF[kzero];
-      f_W    = fwF[kw];
-      f_N    = fnF[kzero];
-      f_S    = fsF[ks];
-      f_T    = ftF[kzero];
-      f_B    = fbF[kb];
-      f_NE   = fneF[kzero];
-      f_SW   = fswF[ksw];
-      f_SE   = fseF[ks];
-      f_NW   = fnwF[kw];
-      f_TE   = fteF[kzero];
-      f_BW   = fbwF[kbw];
-      f_BE   = fbeF[kb];
-      f_TW   = ftwF[kw];
-      f_TN   = ftnF[kzero];
-      f_BS   = fbsF[kbs];
-      f_BN   = fbnF[kb];
-      f_TS   = ftsF[ks];
-      f_ZERO = fzeroF[kzero];
-      f_TNE  = ftneF[kzero];
-      f_TSW  = ftswF[ksw];
-      f_TSE  = ftseF[ks];
-      f_TNW  = ftnwF[kw];
-      f_BNE  = fbneF[kb];
-      f_BSW  = fbswF[kbsw];
-      f_BSE  = fbseF[kbs];
-      f_BNW  = fbnwF[kbw];
-
-      drho_NWB = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-      vx1_NWB  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(c1o1 + drho_NWB);
-	  vx2_NWB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(c1o1 + drho_NWB);
-	  vx3_NWB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(c1o1 + drho_NWB);
-
-      kxyFromfcNEQ_NWB    = -c3o1*omegaS*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (c1o1 + drho_NWB) - ((vx1_NWB*vx2_NWB)));
-      kyzFromfcNEQ_NWB    = -c3o1*omegaS*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (c1o1 + drho_NWB) - ((vx2_NWB*vx3_NWB)));
-      kxzFromfcNEQ_NWB    = -c3o1*omegaS*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (c1o1 + drho_NWB) - ((vx1_NWB*vx3_NWB)));
-      kxxMyyFromfcNEQ_NWB = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_NWB) - ((vx1_NWB*vx1_NWB-vx2_NWB*vx2_NWB)));
-      kxxMzzFromfcNEQ_NWB = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_NWB) - ((vx1_NWB*vx1_NWB-vx3_NWB*vx3_NWB)));
-
-      //////////////////////////////////////////////////////////////////////////
-      //NWT//
-      //////////////////////////////////////////////////////////////////////////
-      //index 
-      kzero= kb;
-      kw   = kbw;   
-      ks   = kbs;   
-      kb   = neighborFZ[kb];   
-      ksw  = kbsw;  
-      kbw  = neighborFZ[kbw];  
-      kbs  = neighborFZ[kbs];  
-      kbsw = neighborFZ[kbsw]; 
-      ////////////////////////////////////////////////////////////////////////////////
-      f_E    = feF[kzero];
-      f_W    = fwF[kw];
-      f_N    = fnF[kzero];
-      f_S    = fsF[ks];
-      f_T    = ftF[kzero];
-      f_B    = fbF[kb];
-      f_NE   = fneF[kzero];
-      f_SW   = fswF[ksw];
-      f_SE   = fseF[ks];
-      f_NW   = fnwF[kw];
-      f_TE   = fteF[kzero];
-      f_BW   = fbwF[kbw];
-      f_BE   = fbeF[kb];
-      f_TW   = ftwF[kw];
-      f_TN   = ftnF[kzero];
-      f_BS   = fbsF[kbs];
-      f_BN   = fbnF[kb];
-      f_TS   = ftsF[ks];
-      f_ZERO = fzeroF[kzero];
-      f_TNE  = ftneF[kzero];
-      f_TSW  = ftswF[ksw];
-      f_TSE  = ftseF[ks];
-      f_TNW  = ftnwF[kw];
-      f_BNE  = fbneF[kb];
-      f_BSW  = fbswF[kbsw];
-      f_BSE  = fbseF[kbs];
-      f_BNW  = fbnwF[kbw];
-
-      drho_NWT = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-      vx1_NWT  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(c1o1 + drho_NWT);
-	  vx2_NWT  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(c1o1 + drho_NWT);
-	  vx3_NWT  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(c1o1 + drho_NWT);
-
-      kxyFromfcNEQ_NWT    = -c3o1*omegaS*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (c1o1 + drho_NWT) - ((vx1_NWT*vx2_NWT)));
-      kyzFromfcNEQ_NWT    = -c3o1*omegaS*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (c1o1 + drho_NWT) - ((vx2_NWT*vx3_NWT)));
-      kxzFromfcNEQ_NWT    = -c3o1*omegaS*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (c1o1 + drho_NWT) - ((vx1_NWT*vx3_NWT)));
-      kxxMyyFromfcNEQ_NWT = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_NWT) - ((vx1_NWT*vx1_NWT-vx2_NWT*vx2_NWT)));
-      kxxMzzFromfcNEQ_NWT = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_NWT) - ((vx1_NWT*vx1_NWT-vx3_NWT*vx3_NWT)));
-
-      //////////////////////////////////////////////////////////////////////////
-      //NET//
-      //////////////////////////////////////////////////////////////////////////
-      //index 
-      kzero= kw;
-      kw   = neighborFX[kw];   
-      ks   = ksw;   
-      kb   = kbw;   
-      ksw  = neighborFX[ksw];  
-      kbw  = neighborFX[kbw];  
-      kbs  = kbsw;  
-      kbsw = neighborFX[kbsw]; 
-      ////////////////////////////////////////////////////////////////////////////////
-      f_E    = feF[kzero];
-      f_W    = fwF[kw];
-      f_N    = fnF[kzero];
-      f_S    = fsF[ks];
-      f_T    = ftF[kzero];
-      f_B    = fbF[kb];
-      f_NE   = fneF[kzero];
-      f_SW   = fswF[ksw];
-      f_SE   = fseF[ks];
-      f_NW   = fnwF[kw];
-      f_TE   = fteF[kzero];
-      f_BW   = fbwF[kbw];
-      f_BE   = fbeF[kb];
-      f_TW   = ftwF[kw];
-      f_TN   = ftnF[kzero];
-      f_BS   = fbsF[kbs];
-      f_BN   = fbnF[kb];
-      f_TS   = ftsF[ks];
-      f_ZERO = fzeroF[kzero];
-      f_TNE  = ftneF[kzero];
-      f_TSW  = ftswF[ksw];
-      f_TSE  = ftseF[ks];
-      f_TNW  = ftnwF[kw];
-      f_BNE  = fbneF[kb];
-      f_BSW  = fbswF[kbsw];
-      f_BSE  = fbseF[kbs];
-      f_BNW  = fbnwF[kbw];
-
-      drho_NET = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-      vx1_NET  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(c1o1 + drho_NET);
-	  vx2_NET  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(c1o1 + drho_NET);
-	  vx3_NET  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(c1o1 + drho_NET);
-
-      kxyFromfcNEQ_NET    = -c3o1*omegaS*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (c1o1 + drho_NET) - ((vx1_NET*vx2_NET)));
-      kyzFromfcNEQ_NET    = -c3o1*omegaS*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (c1o1 + drho_NET) - ((vx2_NET*vx3_NET)));
-      kxzFromfcNEQ_NET    = -c3o1*omegaS*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (c1o1 + drho_NET) - ((vx1_NET*vx3_NET)));
-      kxxMyyFromfcNEQ_NET = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_NET) - ((vx1_NET*vx1_NET-vx2_NET*vx2_NET)));
-      kxxMzzFromfcNEQ_NET = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_NET) - ((vx1_NET*vx1_NET-vx3_NET*vx3_NET)));
-
-      //////////////////////////////////////////////////////////////////////////
-      //NEB//
-      //////////////////////////////////////////////////////////////////////////
-      //index 
-      kb   = kzero;   
-      kbw  = kw;  
-      kbs  = ks;  
-      kbsw = ksw; 
-      kzero= k0w;
-      kw   = neighborFX[k0w];   
-      ks   = k0sw;   
-      ksw  = neighborFX[k0sw];  
-      ////////////////////////////////////////////////////////////////////////////////
-      f_E    = feF[kzero];
-      f_W    = fwF[kw];
-      f_N    = fnF[kzero];
-      f_S    = fsF[ks];
-      f_T    = ftF[kzero];
-      f_B    = fbF[kb];
-      f_NE   = fneF[kzero];
-      f_SW   = fswF[ksw];
-      f_SE   = fseF[ks];
-      f_NW   = fnwF[kw];
-      f_TE   = fteF[kzero];
-      f_BW   = fbwF[kbw];
-      f_BE   = fbeF[kb];
-      f_TW   = ftwF[kw];
-      f_TN   = ftnF[kzero];
-      f_BS   = fbsF[kbs];
-      f_BN   = fbnF[kb];
-      f_TS   = ftsF[ks];
-      f_ZERO = fzeroF[kzero];
-      f_TNE  = ftneF[kzero];
-      f_TSW  = ftswF[ksw];
-      f_TSE  = ftseF[ks];
-      f_TNW  = ftnwF[kw];
-      f_BNE  = fbneF[kb];
-      f_BSW  = fbswF[kbsw];
-      f_BSE  = fbseF[kbs];
-      f_BNW  = fbnwF[kbw];
-
-      drho_NEB = f_E+f_W+f_N+f_S+f_T+f_B+f_NE+f_SW+f_SE+f_NW+f_TE+f_BW+f_BE+f_TW+f_TN+f_BS+f_BN+f_TS+f_ZERO+f_TNE+f_TSW+f_TSE+f_TNW+f_BNE+f_BSW+f_BSE+f_BNW;
-      vx1_NEB  = (((f_TNE-f_BSW)+(f_TSE-f_BNW)+(f_BNE-f_TSW)+(f_BSE-f_TNW)) + (((f_NE-f_SW)+(f_TE-f_BW))+((f_SE-f_NW)+(f_BE-f_TW))) + (f_E-f_W))/(c1o1 + drho_NEB);
-	  vx2_NEB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_BNE-f_TSW)+(f_BNW-f_TSE)) + (((f_NE-f_SW)+(f_TN-f_BS))+((f_BN-f_TS)+(f_NW-f_SE))) + (f_N-f_S))/(c1o1 + drho_NEB);
-	  vx3_NEB  = (((f_TNE-f_BSW)+(f_TNW-f_BSE)+(f_TSE-f_BNW)+(f_TSW-f_BNE)) + (((f_TE-f_BW)+(f_TN-f_BS))+((f_TW-f_BE)+(f_TS-f_BN))) + (f_T-f_B))/(c1o1 + drho_NEB);
-
-      kxyFromfcNEQ_NEB    = -c3o1*omegaS*((f_SW+f_BSW+f_TSW-f_NW-f_BNW-f_TNW-f_SE-f_BSE-f_TSE+f_NE+f_BNE+f_TNE ) / (c1o1 + drho_NEB) - ((vx1_NEB*vx2_NEB)));
-      kyzFromfcNEQ_NEB    = -c3o1*omegaS*((f_BS+f_BSE+f_BSW-f_TS-f_TSE-f_TSW-f_BN-f_BNE-f_BNW+f_TN+f_TNE+f_TNW ) / (c1o1 + drho_NEB) - ((vx2_NEB*vx3_NEB)));
-      kxzFromfcNEQ_NEB    = -c3o1*omegaS*((f_BW+f_BSW+f_BNW-f_TW-f_TSW-f_TNW-f_BE-f_BSE-f_BNE+f_TE+f_TSE+f_TNE ) / (c1o1 + drho_NEB) - ((vx1_NEB*vx3_NEB)));
-      kxxMyyFromfcNEQ_NEB = -c3o2*omegaS *((f_BW+f_W+f_TW-f_BS-f_S-f_TS-f_BN-f_N-f_TN+f_BE+f_E+f_TE             ) / (c1o1 + drho_NEB) - ((vx1_NEB*vx1_NEB-vx2_NEB*vx2_NEB)));
-      kxxMzzFromfcNEQ_NEB = -c3o2*omegaS *((f_SW+f_W+f_NW-f_BS-f_TS-f_B-f_T-f_BN-f_TN+f_SE+f_E+f_NE             ) / (c1o1 + drho_NEB) - ((vx1_NEB*vx1_NEB-vx3_NEB*vx3_NEB)));
-
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //kxyFromfcNEQ_SWB    = zero;
-	  //kyzFromfcNEQ_SWB    = zero;
-	  //kxzFromfcNEQ_SWB    = zero;
-	  //kxxMyyFromfcNEQ_SWB = zero;
-	  //kxxMzzFromfcNEQ_SWB = zero;
-	  //kxyFromfcNEQ_SWT    = zero;
-	  //kyzFromfcNEQ_SWT    = zero;
-	  //kxzFromfcNEQ_SWT    = zero;
-	  //kxxMyyFromfcNEQ_SWT = zero;
-	  //kxxMzzFromfcNEQ_SWT = zero;
-	  //kxyFromfcNEQ_SET    = zero;
-	  //kyzFromfcNEQ_SET    = zero;
-	  //kxzFromfcNEQ_SET    = zero;
-	  //kxxMyyFromfcNEQ_SET = zero;
-	  //kxxMzzFromfcNEQ_SET = zero;
-	  //kxyFromfcNEQ_SEB    = zero;
-	  //kyzFromfcNEQ_SEB    = zero;
-	  //kxzFromfcNEQ_SEB    = zero;
-	  //kxxMyyFromfcNEQ_SEB = zero;
-	  //kxxMzzFromfcNEQ_SEB = zero;
-	  //kxyFromfcNEQ_NWB    = zero;
-	  //kyzFromfcNEQ_NWB    = zero;
-	  //kxzFromfcNEQ_NWB    = zero;
-	  //kxxMyyFromfcNEQ_NWB = zero;
-	  //kxxMzzFromfcNEQ_NWB = zero;
-	  //kxyFromfcNEQ_NWT    = zero;
-	  //kyzFromfcNEQ_NWT    = zero;
-	  //kxzFromfcNEQ_NWT    = zero;
-	  //kxxMyyFromfcNEQ_NWT = zero;
-	  //kxxMzzFromfcNEQ_NWT = zero;
-	  //kxyFromfcNEQ_NET    = zero;
-	  //kyzFromfcNEQ_NET    = zero;
-	  //kxzFromfcNEQ_NET    = zero;
-	  //kxxMyyFromfcNEQ_NET = zero;
-	  //kxxMzzFromfcNEQ_NET = zero;
-	  //kxyFromfcNEQ_NEB    = zero;
-	  //kyzFromfcNEQ_NEB    = zero;
-	  //kxzFromfcNEQ_NEB    = zero;
-	  //kxxMyyFromfcNEQ_NEB = zero;
-	  //kxxMzzFromfcNEQ_NEB = zero;
-      //////////////////////////////////////////////////////////////////////////
-      //3
-      //////////////////////////////////////////////////////////////////////////
-      a0 = (-kxxMyyFromfcNEQ_NEB - kxxMyyFromfcNEQ_NET + kxxMyyFromfcNEQ_NWB + kxxMyyFromfcNEQ_NWT - 
-			 kxxMyyFromfcNEQ_SEB - kxxMyyFromfcNEQ_SET + kxxMyyFromfcNEQ_SWB + kxxMyyFromfcNEQ_SWT - 
-			 kxxMzzFromfcNEQ_NEB - kxxMzzFromfcNEQ_NET + kxxMzzFromfcNEQ_NWB + kxxMzzFromfcNEQ_NWT - 
-			 kxxMzzFromfcNEQ_SEB - kxxMzzFromfcNEQ_SET + kxxMzzFromfcNEQ_SWB + kxxMzzFromfcNEQ_SWT - 
-			 c2o1*kxyFromfcNEQ_NEB - c2o1*kxyFromfcNEQ_NET - c2o1*kxyFromfcNEQ_NWB - c2o1*kxyFromfcNEQ_NWT + 
-			 c2o1*kxyFromfcNEQ_SEB + c2o1*kxyFromfcNEQ_SET + c2o1*kxyFromfcNEQ_SWB + c2o1*kxyFromfcNEQ_SWT + 
-			 c2o1*kxzFromfcNEQ_NEB - c2o1*kxzFromfcNEQ_NET + c2o1*kxzFromfcNEQ_NWB - c2o1*kxzFromfcNEQ_NWT + 
-			 c2o1*kxzFromfcNEQ_SEB - c2o1*kxzFromfcNEQ_SET + c2o1*kxzFromfcNEQ_SWB - c2o1*kxzFromfcNEQ_SWT + 
-			 c8o1*vx1_NEB + c8o1*vx1_NET + c8o1*vx1_NWB + c8o1*vx1_NWT + c8o1*vx1_SEB + 
-			 c8o1*vx1_SET + c8o1*vx1_SWB + c8o1*vx1_SWT + c2o1*vx2_NEB + c2o1*vx2_NET - 
-			 c2o1*vx2_NWB - c2o1*vx2_NWT - c2o1*vx2_SEB - c2o1*vx2_SET + c2o1*vx2_SWB + 
-			 c2o1*vx2_SWT - c2o1*vx3_NEB + c2o1*vx3_NET + c2o1*vx3_NWB - c2o1*vx3_NWT - 
-			 c2o1*vx3_SEB + c2o1*vx3_SET + c2o1*vx3_SWB - c2o1*vx3_SWT)/c64o1;
-      b0 = (c2o1*kxxMyyFromfcNEQ_NEB + c2o1*kxxMyyFromfcNEQ_NET + c2o1*kxxMyyFromfcNEQ_NWB + c2o1*kxxMyyFromfcNEQ_NWT - 
-			 c2o1*kxxMyyFromfcNEQ_SEB - c2o1*kxxMyyFromfcNEQ_SET - c2o1*kxxMyyFromfcNEQ_SWB - c2o1*kxxMyyFromfcNEQ_SWT - 
-			 kxxMzzFromfcNEQ_NEB - kxxMzzFromfcNEQ_NET - kxxMzzFromfcNEQ_NWB - kxxMzzFromfcNEQ_NWT + 
-			 kxxMzzFromfcNEQ_SEB + kxxMzzFromfcNEQ_SET + kxxMzzFromfcNEQ_SWB + kxxMzzFromfcNEQ_SWT - 
-			 c2o1*kxyFromfcNEQ_NEB - c2o1*kxyFromfcNEQ_NET + c2o1*kxyFromfcNEQ_NWB + c2o1*kxyFromfcNEQ_NWT - 
-			 c2o1*kxyFromfcNEQ_SEB - c2o1*kxyFromfcNEQ_SET + c2o1*kxyFromfcNEQ_SWB + c2o1*kxyFromfcNEQ_SWT + 
-			 c2o1*kyzFromfcNEQ_NEB - c2o1*kyzFromfcNEQ_NET + c2o1*kyzFromfcNEQ_NWB - c2o1*kyzFromfcNEQ_NWT + 
-			 c2o1*kyzFromfcNEQ_SEB - c2o1*kyzFromfcNEQ_SET + c2o1*kyzFromfcNEQ_SWB - c2o1*kyzFromfcNEQ_SWT + 
-			 c2o1*vx1_NEB + c2o1*vx1_NET - c2o1*vx1_NWB - c2o1*vx1_NWT - 
-			 c2o1*vx1_SEB - c2o1*vx1_SET + c2o1*vx1_SWB + c2o1*vx1_SWT + 
-			 c8o1*vx2_NEB + c8o1*vx2_NET + c8o1*vx2_NWB + c8o1*vx2_NWT + 
-			 c8o1*vx2_SEB + c8o1*vx2_SET + c8o1*vx2_SWB + c8o1*vx2_SWT - 
-			 c2o1*vx3_NEB + c2o1*vx3_NET - c2o1*vx3_NWB + c2o1*vx3_NWT + 
-			 c2o1*vx3_SEB - c2o1*vx3_SET + c2o1*vx3_SWB - c2o1*vx3_SWT)/c64o1;
-      c0 = (kxxMyyFromfcNEQ_NEB - kxxMyyFromfcNEQ_NET + kxxMyyFromfcNEQ_NWB - kxxMyyFromfcNEQ_NWT + 
-			 kxxMyyFromfcNEQ_SEB - kxxMyyFromfcNEQ_SET + kxxMyyFromfcNEQ_SWB - kxxMyyFromfcNEQ_SWT - 
-			 c2o1*kxxMzzFromfcNEQ_NEB + c2o1*kxxMzzFromfcNEQ_NET - c2o1*kxxMzzFromfcNEQ_NWB + c2o1*kxxMzzFromfcNEQ_NWT - 
-			 c2o1*kxxMzzFromfcNEQ_SEB + c2o1*kxxMzzFromfcNEQ_SET - c2o1*kxxMzzFromfcNEQ_SWB + c2o1*kxxMzzFromfcNEQ_SWT - 
-			 c2o1*kxzFromfcNEQ_NEB - c2o1*kxzFromfcNEQ_NET + c2o1*kxzFromfcNEQ_NWB + c2o1*kxzFromfcNEQ_NWT - 
-			 c2o1*kxzFromfcNEQ_SEB - c2o1*kxzFromfcNEQ_SET + c2o1*kxzFromfcNEQ_SWB + c2o1*kxzFromfcNEQ_SWT - 
-			 c2o1*kyzFromfcNEQ_NEB - c2o1*kyzFromfcNEQ_NET - c2o1*kyzFromfcNEQ_NWB - c2o1*kyzFromfcNEQ_NWT + 
-			 c2o1*kyzFromfcNEQ_SEB + c2o1*kyzFromfcNEQ_SET + c2o1*kyzFromfcNEQ_SWB + c2o1*kyzFromfcNEQ_SWT - 
-			 c2o1*vx1_NEB + c2o1*vx1_NET + c2o1*vx1_NWB - c2o1*vx1_NWT - 
-			 c2o1*vx1_SEB + c2o1*vx1_SET + c2o1*vx1_SWB - c2o1*vx1_SWT - 
-			 c2o1*vx2_NEB + c2o1*vx2_NET - c2o1*vx2_NWB + c2o1*vx2_NWT + 
-			 c2o1*vx2_SEB - c2o1*vx2_SET + c2o1*vx2_SWB - c2o1*vx2_SWT + 
-			 c8o1*vx3_NEB + c8o1*vx3_NET + c8o1*vx3_NWB + c8o1*vx3_NWT + 
-			 c8o1*vx3_SEB + c8o1*vx3_SET + c8o1*vx3_SWB + c8o1*vx3_SWT)/c64o1;
-      ax = (vx1_NEB + vx1_NET - vx1_NWB - vx1_NWT + vx1_SEB + vx1_SET - vx1_SWB - vx1_SWT)/c4o1;
-      bx = (vx2_NEB + vx2_NET - vx2_NWB - vx2_NWT + vx2_SEB + vx2_SET - vx2_SWB - vx2_SWT)/c4o1;
-      cx = (vx3_NEB + vx3_NET - vx3_NWB - vx3_NWT + vx3_SEB + vx3_SET - vx3_SWB - vx3_SWT)/c4o1;
-      axx= (kxxMyyFromfcNEQ_NEB + kxxMyyFromfcNEQ_NET - kxxMyyFromfcNEQ_NWB - kxxMyyFromfcNEQ_NWT + 
-			 kxxMyyFromfcNEQ_SEB + kxxMyyFromfcNEQ_SET - kxxMyyFromfcNEQ_SWB - kxxMyyFromfcNEQ_SWT + 
-			 kxxMzzFromfcNEQ_NEB + kxxMzzFromfcNEQ_NET - kxxMzzFromfcNEQ_NWB - kxxMzzFromfcNEQ_NWT + 
-			 kxxMzzFromfcNEQ_SEB + kxxMzzFromfcNEQ_SET - kxxMzzFromfcNEQ_SWB - kxxMzzFromfcNEQ_SWT + 
-			 c2o1*vx2_NEB + c2o1*vx2_NET - c2o1*vx2_NWB - c2o1*vx2_NWT - 
-			 c2o1*vx2_SEB - c2o1*vx2_SET + c2o1*vx2_SWB + c2o1*vx2_SWT - 
-			 c2o1*vx3_NEB + c2o1*vx3_NET + c2o1*vx3_NWB - c2o1*vx3_NWT - 
-			 c2o1*vx3_SEB + c2o1*vx3_SET + c2o1*vx3_SWB - c2o1*vx3_SWT)/c16o1;
-      bxx= (kxyFromfcNEQ_NEB + kxyFromfcNEQ_NET - kxyFromfcNEQ_NWB - kxyFromfcNEQ_NWT + 
-			 kxyFromfcNEQ_SEB + kxyFromfcNEQ_SET - kxyFromfcNEQ_SWB - kxyFromfcNEQ_SWT - 
-			 c2o1*vx1_NEB - c2o1*vx1_NET + c2o1*vx1_NWB + c2o1*vx1_NWT + 
-			 c2o1*vx1_SEB + c2o1*vx1_SET - c2o1*vx1_SWB - c2o1*vx1_SWT)/c8o1;
-      cxx= (kxzFromfcNEQ_NEB + kxzFromfcNEQ_NET - kxzFromfcNEQ_NWB - kxzFromfcNEQ_NWT + 
-			 kxzFromfcNEQ_SEB + kxzFromfcNEQ_SET - kxzFromfcNEQ_SWB - kxzFromfcNEQ_SWT + 
-			 c2o1*vx1_NEB - c2o1*vx1_NET - c2o1*vx1_NWB + c2o1*vx1_NWT + 
-			 c2o1*vx1_SEB - c2o1*vx1_SET - c2o1*vx1_SWB + c2o1*vx1_SWT)/c8o1;
-      ay = (vx1_NEB + vx1_NET + vx1_NWB + vx1_NWT - vx1_SEB - vx1_SET - vx1_SWB - vx1_SWT)/c4o1;
-      by = (vx2_NEB + vx2_NET + vx2_NWB + vx2_NWT - vx2_SEB - vx2_SET - vx2_SWB - vx2_SWT)/c4o1;
-      cy = (vx3_NEB + vx3_NET + vx3_NWB + vx3_NWT - vx3_SEB - vx3_SET - vx3_SWB - vx3_SWT)/c4o1;
-      ayy= (kxyFromfcNEQ_NEB + kxyFromfcNEQ_NET + kxyFromfcNEQ_NWB + kxyFromfcNEQ_NWT - 
-			 kxyFromfcNEQ_SEB - kxyFromfcNEQ_SET - kxyFromfcNEQ_SWB - kxyFromfcNEQ_SWT - 
-			 c2o1*vx2_NEB - c2o1*vx2_NET + c2o1*vx2_NWB + c2o1*vx2_NWT + 
-			 c2o1*vx2_SEB + c2o1*vx2_SET - c2o1*vx2_SWB - c2o1*vx2_SWT)/c8o1;
-      byy= (-c2o1*kxxMyyFromfcNEQ_NEB - c2o1*kxxMyyFromfcNEQ_NET - c2o1*kxxMyyFromfcNEQ_NWB - c2o1*kxxMyyFromfcNEQ_NWT + 
-			 c2o1*kxxMyyFromfcNEQ_SEB + c2o1*kxxMyyFromfcNEQ_SET + c2o1*kxxMyyFromfcNEQ_SWB + c2o1*kxxMyyFromfcNEQ_SWT + 
-			 kxxMzzFromfcNEQ_NEB + kxxMzzFromfcNEQ_NET + kxxMzzFromfcNEQ_NWB + kxxMzzFromfcNEQ_NWT - 
-			 kxxMzzFromfcNEQ_SEB - kxxMzzFromfcNEQ_SET - kxxMzzFromfcNEQ_SWB - kxxMzzFromfcNEQ_SWT + 
-			 c2o1*vx1_NEB + c2o1*vx1_NET - c2o1*vx1_NWB - c2o1*vx1_NWT - 
-			 c2o1*vx1_SEB - c2o1*vx1_SET + c2o1*vx1_SWB + c2o1*vx1_SWT - 
-			 c2o1*vx3_NEB + c2o1*vx3_NET - c2o1*vx3_NWB + c2o1*vx3_NWT + 
-			 c2o1*vx3_SEB - c2o1*vx3_SET + c2o1*vx3_SWB - c2o1*vx3_SWT)/c16o1;
-      cyy= (kyzFromfcNEQ_NEB + kyzFromfcNEQ_NET + kyzFromfcNEQ_NWB + kyzFromfcNEQ_NWT - 
-			 kyzFromfcNEQ_SEB - kyzFromfcNEQ_SET - kyzFromfcNEQ_SWB - kyzFromfcNEQ_SWT + 
-			 c2o1*vx2_NEB - c2o1*vx2_NET + c2o1*vx2_NWB - c2o1*vx2_NWT - 
-			 c2o1*vx2_SEB + c2o1*vx2_SET - c2o1*vx2_SWB + c2o1*vx2_SWT)/c8o1;
-      az = (-vx1_NEB + vx1_NET - vx1_NWB + vx1_NWT - vx1_SEB + vx1_SET - vx1_SWB + vx1_SWT)/c4o1;
-      bz = (-vx2_NEB + vx2_NET - vx2_NWB + vx2_NWT - vx2_SEB + vx2_SET - vx2_SWB + vx2_SWT)/c4o1;
-      cz = (-vx3_NEB + vx3_NET - vx3_NWB + vx3_NWT - vx3_SEB + vx3_SET - vx3_SWB + vx3_SWT)/c4o1;
-      azz= (-kxzFromfcNEQ_NEB + kxzFromfcNEQ_NET - kxzFromfcNEQ_NWB + kxzFromfcNEQ_NWT - 
-			 kxzFromfcNEQ_SEB + kxzFromfcNEQ_SET - kxzFromfcNEQ_SWB + kxzFromfcNEQ_SWT + 
-			 c2o1*vx3_NEB - c2o1*vx3_NET - c2o1*vx3_NWB + c2o1*vx3_NWT + 
-			 c2o1*vx3_SEB - c2o1*vx3_SET - c2o1*vx3_SWB + c2o1*vx3_SWT)/c8o1;
-      bzz= (-kyzFromfcNEQ_NEB + kyzFromfcNEQ_NET - kyzFromfcNEQ_NWB + kyzFromfcNEQ_NWT - 
-			 kyzFromfcNEQ_SEB + kyzFromfcNEQ_SET - kyzFromfcNEQ_SWB + kyzFromfcNEQ_SWT + 
-			 c2o1*vx3_NEB - c2o1*vx3_NET + c2o1*vx3_NWB - c2o1*vx3_NWT - 
-			 c2o1*vx3_SEB + c2o1*vx3_SET - c2o1*vx3_SWB + c2o1*vx3_SWT)/c8o1;
-      czz= (-kxxMyyFromfcNEQ_NEB + kxxMyyFromfcNEQ_NET - kxxMyyFromfcNEQ_NWB + kxxMyyFromfcNEQ_NWT - 
-			 kxxMyyFromfcNEQ_SEB + kxxMyyFromfcNEQ_SET - kxxMyyFromfcNEQ_SWB + kxxMyyFromfcNEQ_SWT + 
-			 c2o1*kxxMzzFromfcNEQ_NEB - c2o1*kxxMzzFromfcNEQ_NET + c2o1*kxxMzzFromfcNEQ_NWB - c2o1*kxxMzzFromfcNEQ_NWT + 
-			 c2o1*kxxMzzFromfcNEQ_SEB - c2o1*kxxMzzFromfcNEQ_SET + c2o1*kxxMzzFromfcNEQ_SWB - c2o1*kxxMzzFromfcNEQ_SWT - 
-			 c2o1*vx1_NEB + c2o1*vx1_NET + c2o1*vx1_NWB - c2o1*vx1_NWT - 
-			 c2o1*vx1_SEB + c2o1*vx1_SET + c2o1*vx1_SWB - c2o1*vx1_SWT - 
-			 c2o1*vx2_NEB + c2o1*vx2_NET - c2o1*vx2_NWB + c2o1*vx2_NWT + 
-			 c2o1*vx2_SEB - c2o1*vx2_SET + c2o1*vx2_SWB - c2o1*vx2_SWT)/c16o1;
-      axy= (vx1_NEB + vx1_NET - vx1_NWB - vx1_NWT - vx1_SEB - vx1_SET + vx1_SWB + vx1_SWT)/c2o1;
-      bxy= (vx2_NEB + vx2_NET - vx2_NWB - vx2_NWT - vx2_SEB - vx2_SET + vx2_SWB + vx2_SWT)/c2o1;
-      cxy= (vx3_NEB + vx3_NET - vx3_NWB - vx3_NWT - vx3_SEB - vx3_SET + vx3_SWB + vx3_SWT)/c2o1;
-      axz= (-vx1_NEB + vx1_NET + vx1_NWB - vx1_NWT - vx1_SEB + vx1_SET + vx1_SWB - vx1_SWT)/c2o1;
-      bxz= (-vx2_NEB + vx2_NET + vx2_NWB - vx2_NWT - vx2_SEB + vx2_SET + vx2_SWB - vx2_SWT)/c2o1;
-      cxz= (-vx3_NEB + vx3_NET + vx3_NWB - vx3_NWT - vx3_SEB + vx3_SET + vx3_SWB - vx3_SWT)/c2o1;
-      ayz= (-vx1_NEB + vx1_NET - vx1_NWB + vx1_NWT + vx1_SEB - vx1_SET + vx1_SWB - vx1_SWT)/c2o1;
-      byz= (-vx2_NEB + vx2_NET - vx2_NWB + vx2_NWT + vx2_SEB - vx2_SET + vx2_SWB - vx2_SWT)/c2o1;
-      cyz= (-vx3_NEB + vx3_NET - vx3_NWB + vx3_NWT + vx3_SEB - vx3_SET + vx3_SWB - vx3_SWT)/c2o1;
-      //axyz=-vx1_NEB + vx1_NET + vx1_NWB - vx1_NWT + vx1_SEB - vx1_SET - vx1_SWB + vx1_SWT;
-      //bxyz=-vx2_NEB + vx2_NET + vx2_NWB - vx2_NWT + vx2_SEB - vx2_SET - vx2_SWB + vx2_SWT;
-      //cxyz=-vx3_NEB + vx3_NET + vx3_NWB - vx3_NWT + vx3_SEB - vx3_SET - vx3_SWB + vx3_SWT;
-      //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  real kxyAverage	     = c0o1;
-	  real kyzAverage	     = c0o1;
-	  real kxzAverage	     = c0o1;
-	  real kxxMyyAverage	 = c0o1;
-	  real kxxMzzAverage	 = c0o1;
-	  //real kxyAverage	 =(kxyFromfcNEQ_SWB+
-			//				   kxyFromfcNEQ_SWT+
-			//				   kxyFromfcNEQ_SET+
-			//				   kxyFromfcNEQ_SEB+
-			//				   kxyFromfcNEQ_NWB+
-			//				   kxyFromfcNEQ_NWT+
-			//				   kxyFromfcNEQ_NET+
-			//				   kxyFromfcNEQ_NEB)*c1o8-(ay+bx);
-	  //real kyzAverage	 =(kyzFromfcNEQ_SWB+
-			//				   kyzFromfcNEQ_SWT+
-			//				   kyzFromfcNEQ_SET+
-			//				   kyzFromfcNEQ_SEB+
-			//				   kyzFromfcNEQ_NWB+
-			//				   kyzFromfcNEQ_NWT+
-			//				   kyzFromfcNEQ_NET+
-			//				   kyzFromfcNEQ_NEB)*c1o8-(bz+cy);
-	  //real kxzAverage	 =(kxzFromfcNEQ_SWB+
-			//				   kxzFromfcNEQ_SWT+
-			//				   kxzFromfcNEQ_SET+
-			//				   kxzFromfcNEQ_SEB+
-			//				   kxzFromfcNEQ_NWB+
-			//				   kxzFromfcNEQ_NWT+
-			//				   kxzFromfcNEQ_NET+
-			//				   kxzFromfcNEQ_NEB)*c1o8-(az+cx);
-	  //real kxxMyyAverage	 =(kxxMyyFromfcNEQ_SWB+
-			//				   kxxMyyFromfcNEQ_SWT+
-			//				   kxxMyyFromfcNEQ_SET+
-			//				   kxxMyyFromfcNEQ_SEB+
-			//				   kxxMyyFromfcNEQ_NWB+
-			//				   kxxMyyFromfcNEQ_NWT+
-			//				   kxxMyyFromfcNEQ_NET+
-			//				   kxxMyyFromfcNEQ_NEB)*c1o8-(ax-by);
-	  //real kxxMzzAverage	 =(kxxMzzFromfcNEQ_SWB+
-			//				   kxxMzzFromfcNEQ_SWT+
-			//				   kxxMzzFromfcNEQ_SET+
-			//				   kxxMzzFromfcNEQ_SEB+
-			//				   kxxMzzFromfcNEQ_NWB+
-			//				   kxxMzzFromfcNEQ_NWT+
-			//				   kxxMzzFromfcNEQ_NET+
-			//				   kxxMzzFromfcNEQ_NEB)*c1o8-(ax-cz);
-
-
-
-	  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  ////Press
-	  //d0   = ( press_NEB + press_NET + press_NWB + press_NWT + press_SEB + press_SET + press_SWB + press_SWT) * c1o8;
-	  //dx   = ( press_NEB + press_NET - press_NWB - press_NWT + press_SEB + press_SET - press_SWB - press_SWT) * c1o4;
-	  //dy   = ( press_NEB + press_NET + press_NWB + press_NWT - press_SEB - press_SET - press_SWB - press_SWT) * c1o4;
-	  //dz   = (-press_NEB + press_NET - press_NWB + press_NWT - press_SEB + press_SET - press_SWB + press_SWT) * c1o4;
-	  //dxy  = ( press_NEB + press_NET - press_NWB - press_NWT - press_SEB - press_SET + press_SWB + press_SWT) * c1o2;
-	  //dxz  = (-press_NEB + press_NET + press_NWB - press_NWT - press_SEB + press_SET + press_SWB - press_SWT) * c1o2;
-	  //dyz  = (-press_NEB + press_NET - press_NWB + press_NWT + press_SEB - press_SET + press_SWB - press_SWT) * c1o2;
-	  //dxyz =  -press_NEB + press_NET + press_NWB - press_NWT + press_SEB - press_SET - press_SWB + press_SWT;
-	  //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //drho
-	  real LapRho = ((xoff != c0o1) || (yoff != c0o1) || (zoff != c0o1)) ? c0o1 : -c3o1*(ax*ax + by*by + cz*cz) - c6o1 * (bx*ay + cx*az + cy*bz); 
-	  d0   = ( drho_NEB + drho_NET + drho_NWB + drho_NWT + drho_SEB + drho_SET + drho_SWB + drho_SWT - c2o1*LapRho) * c1o8;
-	  dx   = ( drho_NEB + drho_NET - drho_NWB - drho_NWT + drho_SEB + drho_SET - drho_SWB - drho_SWT) * c1o4;
-	  dy   = ( drho_NEB + drho_NET + drho_NWB + drho_NWT - drho_SEB - drho_SET - drho_SWB - drho_SWT) * c1o4;
-	  dz   = (-drho_NEB + drho_NET - drho_NWB + drho_NWT - drho_SEB + drho_SET - drho_SWB + drho_SWT) * c1o4;
-	  dxy  = ( drho_NEB + drho_NET - drho_NWB - drho_NWT - drho_SEB - drho_SET + drho_SWB + drho_SWT) * c1o2;
-	  dxz  = (-drho_NEB + drho_NET + drho_NWB - drho_NWT - drho_SEB + drho_SET + drho_SWB - drho_SWT) * c1o2;
-	  dyz  = (-drho_NEB + drho_NET - drho_NWB + drho_NWT + drho_SEB - drho_SET + drho_SWB - drho_SWT) * c1o2;
-	  //dxyz =  -drho_NEB + drho_NET + drho_NWB - drho_NWT + drho_SEB - drho_SET - drho_SWB + drho_SWT;
-	  //d0   = zero;
-	  //dx   = zero;
-	  //dy   = zero;
-	  //dz   = zero;
-	  //dxy  = zero;
-	  //dxz  = zero;
-	  //dyz  = zero;
-	  //dxyz = zero;
-      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      //
-      // Bernd das Brot 
-	  //
-      //
-	  // x------x
-	  // |      |
-	  // |	 ---+--->X
-	  // |		|  \
-	  // x------x   \
-	  //			off-vector
-      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-      a0 = a0 + xoff * ax + yoff * ay + zoff * az + xoff_sq * axx + yoff_sq * ayy + zoff_sq * azz + xoff*yoff*axy + xoff*zoff*axz + yoff*zoff*ayz;
-      ax = ax + c2o1 * xoff * axx + yoff * axy + zoff * axz;
-      ay = ay + c2o1 * yoff * ayy + xoff * axy + zoff * ayz;
-      az = az + c2o1 * zoff * azz + xoff * axz + yoff * ayz;
-      b0 = b0 + xoff * bx + yoff * by + zoff * bz + xoff_sq * bxx + yoff_sq * byy + zoff_sq * bzz + xoff*yoff*bxy + xoff*zoff*bxz + yoff*zoff*byz;
-      bx = bx + c2o1 * xoff * bxx + yoff * bxy + zoff * bxz;
-      by = by + c2o1 * yoff * byy + xoff * bxy + zoff * byz;
-      bz = bz + c2o1 * zoff * bzz + xoff * bxz + yoff * byz;
-      c0 = c0 + xoff * cx + yoff * cy + zoff * cz + xoff_sq * cxx + yoff_sq * cyy + zoff_sq * czz + xoff*yoff*cxy + xoff*zoff*cxz + yoff*zoff*cyz;
-      cx = cx + c2o1 * xoff * cxx + yoff * cxy + zoff * cxz;
-      cy = cy + c2o1 * yoff * cyy + xoff * cxy + zoff * cyz;
-      cz = cz + c2o1 * zoff * czz + xoff * cxz + yoff * cyz;
-	  d0 = d0 + xoff * dx + yoff * dy + zoff * dz + xoff*yoff*dxy + xoff*zoff*dxz + yoff*zoff*dyz;
-      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //  FIX  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //AAAAAAAAAAAAHHHHHHHHHHHH!!!!! Mieser Test!!!
-	  //b0= bx= by= bz= bxx= byy= bzz= bxy= bxz= byz= c0= cx= cy= cz= cxx= cyy= czz= cxy= cxz= cyz= axyz= bxyz= cxyz=zero;
-	  //b0=zero;
-	  //bx=zero;
-	  //by=zero;
-	  //bz=zero;
-	  //bxx=zero;
-	  //byy=zero;
-	  //bzz=zero;
-	  //bxy=zero;
-	  //bxz=zero;
-	  //byz=zero;
-	  //c0=zero;
-	  //cx=zero;
-	  //cy=zero;
-	  //cz=zero;
-	  //cxx=zero;
-	  //cyy=zero;
-	  //czz=zero;
-	  //cxy=zero;
-	  //cxz=zero;
-	  //cyz=zero;
-	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////	  
-	  real mfcbb = c0o1;
-	  real mfabb = c0o1;
-	  real mfbcb = c0o1;
-	  real mfbab = c0o1;
-	  real mfbbc = c0o1;
-	  real mfbba = c0o1;
-	  real mfccb = c0o1;
-	  real mfaab = c0o1;
-	  real mfcab = c0o1;
-	  real mfacb = c0o1;
-	  real mfcbc = c0o1;
-	  real mfaba = c0o1;
-	  real mfcba = c0o1;
-	  real mfabc = c0o1;
-	  real mfbcc = c0o1;
-	  real mfbaa = c0o1;
-	  real mfbca = c0o1;
-	  real mfbac = c0o1;
-	  real mfbbb = c0o1;
-	  real mfccc = c0o1;
-	  real mfaac = c0o1;
-	  real mfcac = c0o1;
-	  real mfacc = c0o1;
-	  real mfcca = c0o1;
-	  real mfaaa = c0o1;
-	  real mfcaa = c0o1;
-	  real mfaca = c0o1;
-	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  real m0, m1, m2, vvx, vvy, vvz, vx2, vy2, vz2, oMdrho;
-	  real mxxPyyPzz, mxxMyy, mxxMzz, mxxyPyzz, mxxyMyzz, mxxzPyyz, mxxzMyyz, mxyyPxzz, mxyyMxzz;
-	  //real qudricLimit = c1o100;//ganz schlechte Idee -> muss global sein
-	  //real O3 = c2o1 - o;
-	  //real residu, residutmp;
-	  //residutmp = c0o1;///*-*/ c2o9 * (1./o - c1o2) * eps_new * eps_new;
-	  real NeqOn = c1o1;//zero;//one;   //.... one = on ..... zero = off 
-	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //
-	  //Position C 0., 0., 0.
-	  //
-	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //x = 0.;
-	  //y = 0.;
-	  //z = 0.;
-	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //real mxoff = -xoff;
-	  //real myoff = -yoff;
-	  //real mzoff = -zoff;
-	  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-	  //press = press_NET * (c1o8 - c1o4 * mxoff - c1o4 * myoff - c1o4 * mzoff) + 
-			//  press_NWT * (c1o8 + c1o4 * mxoff - c1o4 * myoff - c1o4 * mzoff) + 
-			//  press_SET * (c1o8 - c1o4 * mxoff + c1o4 * myoff - c1o4 * mzoff) + 
-			//  press_SWT * (c1o8 + c1o4 * mxoff + c1o4 * myoff - c1o4 * mzoff) + 
-			//  press_NEB * (c1o8 - c1o4 * mxoff - c1o4 * myoff + c1o4 * mzoff) + 
-			//  press_NWB * (c1o8 + c1o4 * mxoff - c1o4 * myoff + c1o4 * mzoff) + 
-			//  press_SEB * (c1o8 - c1o4 * mxoff + c1o4 * myoff + c1o4 * mzoff) + 
-			//  press_SWB * (c1o8 + c1o4 * mxoff + c1o4 * myoff + c1o4 * mzoff);
-	  //drho  = drho_NET * (c1o8 - c1o4 * xoff - c1o4 * yoff - c1o4 * zoff) + 
-			//  drho_NWT * (c1o8 + c1o4 * xoff - c1o4 * yoff - c1o4 * zoff) + 
-			//  drho_SET * (c1o8 - c1o4 * xoff + c1o4 * yoff - c1o4 * zoff) + 
-			//  drho_SWT * (c1o8 + c1o4 * xoff + c1o4 * yoff - c1o4 * zoff) + 
-			//  drho_NEB * (c1o8 - c1o4 * xoff - c1o4 * yoff + c1o4 * zoff) + 
-			//  drho_NWB * (c1o8 + c1o4 * xoff - c1o4 * yoff + c1o4 * zoff) + 
-			//  drho_SEB * (c1o8 - c1o4 * xoff + c1o4 * yoff + c1o4 * zoff) + 
-			//  drho_SWB * (c1o8 + c1o4 * xoff + c1o4 * yoff + c1o4 * zoff);
-	  press = d0;
-	  vvx   = a0;
-	  vvy   = b0;
-	  vvz   = c0;
-
-	  //mfaaa = drho;
-	  //mfaaa = press + (ax+by+cz)/three;  //  1/3 = 2/3*(1/op-1/2)
-	  mfaaa = press; // if drho is interpolated directly
-
-	  vx2 = vvx*vvx;
-	  vy2 = vvy*vvy;
-	  vz2 = vvz*vvz;
-	  oMdrho = c1o1;
-	  //oMdrho = one - mfaaa;
-
-	  //two
-	  // linear combinations
-	  mxxPyyPzz = mfaaa;
-	  //mxxMyy    = -c2o3*(ax - by)*eps_new/o;
-	  //mxxMzz    = -c2o3*(ax - cz)*eps_new/o;
-
-	  //mfabb     = -c1o3 * (bz + cy)*eps_new/o;
-	  //mfbab     = -c1o3 * (az + cx)*eps_new/o;
-	  //mfbba     = -c1o3 * (ay + bx)*eps_new/o;
-	  mxxMyy    = -c2o3*((ax - by)+kxxMyyAverage)*eps_new/o * (c1o1 + press);
-	  mxxMzz    = -c2o3*((ax - cz)+kxxMzzAverage)*eps_new/o * (c1o1 + press);
-
-	  mfabb     = -c1o3 * ((bz + cy)+kyzAverage)*eps_new/o * (c1o1 + press);
-	  mfbab     = -c1o3 * ((az + cx)+kxzAverage)*eps_new/o * (c1o1 + press);
-	  mfbba     = -c1o3 * ((ay + bx)+kxyAverage)*eps_new/o * (c1o1 + press);
-
-	  
-	  // linear combinations back
-	  mfcaa = c1o3 * (       mxxMyy +       mxxMzz + mxxPyyPzz) * NeqOn;
-	  mfaca = c1o3 * (-c2o1 * mxxMyy +       mxxMzz + mxxPyyPzz) * NeqOn;
-	  mfaac = c1o3 * (       mxxMyy - c2o1 * mxxMzz + mxxPyyPzz) * NeqOn;
-
-	  //3.
-	  // linear combinations
-	  //residu = residutmp * (ayz + bxz + cxy );
-	  //mfbbb = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
-	  mfbbb = c0o1;
-
-	  //residu = residutmp * (axy + two*bxx + two*bzz + cyz );
-	  //residu = -(c1o9*(axy - 2*bxx - 2*bzz + cyz ));
-	  //mxxyPyzz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
-	  mxxyPyzz = c0o1;
-
-	  //residu = residutmp * (axy + two*bxx - two*bzz - cyz );
-	  //residu = c1o9*(axy - 2*bxx + 2*bzz - cyz );
-	  //mxxyMyzz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
-	  mxxyMyzz = c0o1;
-
-	  //residu = residutmp * (axz + byz + two*cxx + two*cyy );
-	  //residu = -(c1o9*(axz + byz - 2*cxx - 2*cyy ));
-	  //mxxzPyyz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
-	  mxxzPyyz = c0o1;
-
-	  //residu = residutmp * (axz - byz + two*cxx - two*cyy );
-	  //residu = c1o9*(axz - byz - 2*cxx + 2*cyy );
-	  //mxxzMyyz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
-	  mxxzMyyz = c0o1;
-
-	  //residu = residutmp * (two*ayy + two*azz + bxy + cxz );
-	  //residu = c1o9*(2*ayy + 2*azz - bxy - cxz );
-	  //mxyyPxzz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
-	  mxyyPxzz = c0o1;
-
-	  //residu = residutmp * (two*ayy - two*azz + bxy - cxz );
-	  //residu = c1o9*(-2*ayy + 2*azz + bxy - cxz );
-	  //mxyyMxzz = (abs(residu)+qudricLimit) * residu / (qudricLimit * O3 + abs(residu));
-	  mxyyMxzz = c0o1;
-
-	  // linear combinations back
-	  mfcba = ( mxxyMyzz + mxxyPyzz) * c1o2;
-	  mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
-	  mfcab = ( mxxzMyyz + mxxzPyyz) * c1o2;
-	  mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
-	  mfbca = ( mxyyMxzz + mxyyPxzz) * c1o2;
-	  mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
-
-	  //4.
-	  mfacc = mfaaa*c1o9; 
-	  mfcac = mfacc; 
-	  mfcca = mfacc; 
-	  //5.
-
-	  //6.
-	  mfccc = mfaaa*c1o27;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  //back
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  //mit 1, 0, 1/3, 0, 0, 0, 1/3, 0, 1/9   Konditionieren
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  // Z - Dir
-	  m0 =  mfaac * c1o2 +      mfaab * (vvz - c1o2) + (mfaaa + c1o1 * oMdrho) * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfaac        - c2o1 * mfaab *  vvz         +  mfaaa                * (c1o1 - vz2)              - c1o1 * oMdrho * vz2; 
-	  m2 =  mfaac * c1o2 +      mfaab * (vvz + c1o2) + (mfaaa + c1o1 * oMdrho) * (     vz2 + vvz) * c1o2;
-	  mfaaa = m0;
-	  mfaab = m1;
-	  mfaac = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfabc * c1o2 +      mfabb * (vvz - c1o2) + mfaba * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfabc        - c2o1 * mfabb *  vvz         + mfaba * (c1o1 - vz2); 
-	  m2 =  mfabc * c1o2 +      mfabb * (vvz + c1o2) + mfaba * (     vz2 + vvz) * c1o2;
-	  mfaba = m0;
-	  mfabb = m1;
-	  mfabc = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfacc * c1o2 +      mfacb * (vvz - c1o2) + (mfaca + c1o3 * oMdrho) * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfacc        - c2o1 * mfacb *  vvz         +  mfaca                  * (c1o1 - vz2)              - c1o3 * oMdrho * vz2; 
-	  m2 =  mfacc * c1o2 +      mfacb * (vvz + c1o2) + (mfaca + c1o3 * oMdrho) * (     vz2 + vvz) * c1o2;
-	  mfaca = m0;
-	  mfacb = m1;
-	  mfacc = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfbac * c1o2 +      mfbab * (vvz - c1o2) + mfbaa * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfbac        - c2o1 * mfbab *  vvz         + mfbaa * (c1o1 - vz2); 
-	  m2 =  mfbac * c1o2 +      mfbab * (vvz + c1o2) + mfbaa * (     vz2 + vvz) * c1o2;
-	  mfbaa = m0;
-	  mfbab = m1;
-	  mfbac = m2;
-	  /////////b//////////////////////////////////////////////////////////////////////////
-	  m0 =  mfbbc * c1o2 +      mfbbb * (vvz - c1o2) + mfbba * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfbbc        - c2o1 * mfbbb *  vvz         + mfbba * (c1o1 - vz2); 
-	  m2 =  mfbbc * c1o2 +      mfbbb * (vvz + c1o2) + mfbba * (     vz2 + vvz) * c1o2;
-	  mfbba = m0;
-	  mfbbb = m1;
-	  mfbbc = m2;
-	  /////////b//////////////////////////////////////////////////////////////////////////
-	  m0 =  mfbcc * c1o2 +      mfbcb * (vvz - c1o2) + mfbca * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfbcc        - c2o1 * mfbcb *  vvz         + mfbca * (c1o1 - vz2); 
-	  m2 =  mfbcc * c1o2 +      mfbcb * (vvz + c1o2) + mfbca * (     vz2 + vvz) * c1o2;
-	  mfbca = m0;
-	  mfbcb = m1;
-	  mfbcc = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcac * c1o2 +      mfcab * (vvz - c1o2) + (mfcaa + c1o3 * oMdrho) * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfcac        - c2o1 * mfcab *  vvz         +  mfcaa                  * (c1o1 - vz2)              - c1o3 * oMdrho * vz2; 
-	  m2 =  mfcac * c1o2 +      mfcab * (vvz + c1o2) + (mfcaa + c1o3 * oMdrho) * (     vz2 + vvz) * c1o2;
-	  mfcaa = m0;
-	  mfcab = m1;
-	  mfcac = m2;
-	  /////////c//////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcbc * c1o2 +      mfcbb * (vvz - c1o2) + mfcba * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfcbc        - c2o1 * mfcbb *  vvz         + mfcba * (c1o1 - vz2); 
-	  m2 =  mfcbc * c1o2 +      mfcbb * (vvz + c1o2) + mfcba * (     vz2 + vvz) * c1o2;
-	  mfcba = m0;
-	  mfcbb = m1;
-	  mfcbc = m2;
-	  /////////c//////////////////////////////////////////////////////////////////////////
-	  m0 =  mfccc * c1o2 +      mfccb * (vvz - c1o2) + (mfcca + c1o9 * oMdrho) * (     vz2 - vvz) * c1o2; 
-	  m1 = -mfccc        - c2o1 * mfccb *  vvz         +  mfcca                  * (c1o1 - vz2)              - c1o9 * oMdrho * vz2; 
-	  m2 =  mfccc * c1o2 +      mfccb * (vvz + c1o2) + (mfcca + c1o9 * oMdrho) * (     vz2 + vvz) * c1o2;
-	  mfcca = m0;
-	  mfccb = m1;
-	  mfccc = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  //mit 1/6, 2/3, 1/6, 0, 0, 0, 1/18, 2/9, 1/18   Konditionieren
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  // Y - Dir
-	  m0 =  mfaca * c1o2 +      mfaba * (vvy - c1o2) + (mfaaa + c1o6 * oMdrho) * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfaca        - c2o1 * mfaba *  vvy         +  mfaaa                  * (c1o1 - vy2)              - c1o6 * oMdrho * vy2; 
-	  m2 =  mfaca * c1o2 +      mfaba * (vvy + c1o2) + (mfaaa + c1o6 * oMdrho) * (     vy2 + vvy) * c1o2;
-	  mfaaa = m0;
-	  mfaba = m1;
-	  mfaca = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfacb * c1o2 +      mfabb * (vvy - c1o2) + (mfaab + c2o3 * oMdrho) * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfacb        - c2o1 * mfabb *  vvy         +  mfaab                  * (c1o1 - vy2)              - c2o3 * oMdrho * vy2; 
-	  m2 =  mfacb * c1o2 +      mfabb * (vvy + c1o2) + (mfaab + c2o3 * oMdrho) * (     vy2 + vvy) * c1o2;
-	  mfaab = m0;
-	  mfabb = m1;
-	  mfacb = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfacc * c1o2 +      mfabc * (vvy - c1o2) + (mfaac + c1o6 * oMdrho) * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfacc        - c2o1 * mfabc *  vvy         +  mfaac                  * (c1o1 - vy2)              - c1o6 * oMdrho * vy2; 
-	  m2 =  mfacc * c1o2 +      mfabc * (vvy + c1o2) + (mfaac + c1o6 * oMdrho) * (     vy2 + vvy) * c1o2;
-	  mfaac = m0;
-	  mfabc = m1;
-	  mfacc = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfbca * c1o2 +      mfbba * (vvy - c1o2) + mfbaa * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfbca        - c2o1 * mfbba *  vvy         + mfbaa * (c1o1 - vy2); 
-	  m2 =  mfbca * c1o2 +      mfbba * (vvy + c1o2) + mfbaa * (     vy2 + vvy) * c1o2;
-	  mfbaa = m0;
-	  mfbba = m1;
-	  mfbca = m2;
-	  /////////b//////////////////////////////////////////////////////////////////////////
-	  m0 =  mfbcb * c1o2 +      mfbbb * (vvy - c1o2) + mfbab * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfbcb        - c2o1 * mfbbb *  vvy         + mfbab * (c1o1 - vy2); 
-	  m2 =  mfbcb * c1o2 +      mfbbb * (vvy + c1o2) + mfbab * (     vy2 + vvy) * c1o2;
-	  mfbab = m0;
-	  mfbbb = m1;
-	  mfbcb = m2;
-	  /////////b//////////////////////////////////////////////////////////////////////////
-	  m0 =  mfbcc * c1o2 +      mfbbc * (vvy - c1o2) + mfbac * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfbcc        - c2o1 * mfbbc *  vvy         + mfbac * (c1o1 - vy2); 
-	  m2 =  mfbcc * c1o2 +      mfbbc * (vvy + c1o2) + mfbac * (     vy2 + vvy) * c1o2;
-	  mfbac = m0;
-	  mfbbc = m1;
-	  mfbcc = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcca * c1o2 +      mfcba * (vvy - c1o2) + (mfcaa + c1o18 * oMdrho) * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfcca        - c2o1 * mfcba *  vvy         +  mfcaa                   * (c1o1 - vy2)              - c1o18 * oMdrho * vy2; 
-	  m2 =  mfcca * c1o2 +      mfcba * (vvy + c1o2) + (mfcaa + c1o18 * oMdrho) * (     vy2 + vvy) * c1o2;
-	  mfcaa = m0;
-	  mfcba = m1;
-	  mfcca = m2;
-	  /////////c//////////////////////////////////////////////////////////////////////////
-	  m0 =  mfccb * c1o2 +      mfcbb * (vvy - c1o2) + (mfcab + c2o9 * oMdrho) * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfccb        - c2o1 * mfcbb *  vvy         +  mfcab                  * (c1o1 - vy2)              - c2o9 * oMdrho * vy2; 
-	  m2 =  mfccb * c1o2 +      mfcbb * (vvy + c1o2) + (mfcab + c2o9 * oMdrho) * (     vy2 + vvy) * c1o2;
-	  mfcab = m0;
-	  mfcbb = m1;
-	  mfccb = m2;
-	  /////////c//////////////////////////////////////////////////////////////////////////
-	  m0 =  mfccc * c1o2 +      mfcbc * (vvy - c1o2) + (mfcac + c1o18 * oMdrho) * (     vy2 - vvy) * c1o2; 
-	  m1 = -mfccc        - c2o1 * mfcbc *  vvy         +  mfcac                   * (c1o1 - vy2)              - c1o18 * oMdrho * vy2; 
-	  m2 =  mfccc * c1o2 +      mfcbc * (vvy + c1o2) + (mfcac + c1o18 * oMdrho) * (     vy2 + vvy) * c1o2;
-	  mfcac = m0;
-	  mfcbc = m1;
-	  mfccc = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  //mit 1/36, 1/9, 1/36, 1/9, 4/9, 1/9, 1/36, 1/9, 1/36 Konditionieren
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  // X - Dir
-	  m0 =  mfcaa * c1o2 +      mfbaa * (vvx - c1o2) + (mfaaa + c1o36 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfcaa        - c2o1 * mfbaa *  vvx         +  mfaaa                   * (c1o1 - vx2)              - c1o36 * oMdrho * vx2; 
-	  m2 =  mfcaa * c1o2 +      mfbaa * (vvx + c1o2) + (mfaaa + c1o36 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfaaa = m0;
-	  mfbaa = m1;
-	  mfcaa = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcba * c1o2 +      mfbba * (vvx - c1o2) + (mfaba + c1o9 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfcba        - c2o1 * mfbba *  vvx         +  mfaba                  * (c1o1 - vx2)              - c1o9 * oMdrho * vx2; 
-	  m2 =  mfcba * c1o2 +      mfbba * (vvx + c1o2) + (mfaba + c1o9 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfaba = m0;
-	  mfbba = m1;
-	  mfcba = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcca * c1o2 +      mfbca * (vvx - c1o2) + (mfaca + c1o36 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfcca        - c2o1 * mfbca *  vvx         +  mfaca                   * (c1o1 - vx2)              - c1o36 * oMdrho * vx2; 
-	  m2 =  mfcca * c1o2 +      mfbca * (vvx + c1o2) + (mfaca + c1o36 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfaca = m0;
-	  mfbca = m1;
-	  mfcca = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcab * c1o2 +      mfbab * (vvx - c1o2) + (mfaab + c1o9 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfcab        - c2o1 * mfbab *  vvx         +  mfaab                  * (c1o1 - vx2)              - c1o9 * oMdrho * vx2; 
-	  m2 =  mfcab * c1o2 +      mfbab * (vvx + c1o2) + (mfaab + c1o9 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfaab = m0;
-	  mfbab = m1;
-	  mfcab = m2;
-	  ///////////b////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcbb * c1o2 +      mfbbb * (vvx - c1o2) + (mfabb + c4o9 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfcbb        - c2o1 * mfbbb *  vvx         +  mfabb                  * (c1o1 - vx2)              - c4o9 * oMdrho * vx2; 
-	  m2 =  mfcbb * c1o2 +      mfbbb * (vvx + c1o2) + (mfabb + c4o9 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfabb = m0;
-	  mfbbb = m1;
-	  mfcbb = m2;
-	  ///////////b////////////////////////////////////////////////////////////////////////
-	  m0 =  mfccb * c1o2 +      mfbcb * (vvx - c1o2) + (mfacb + c1o9 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfccb        - c2o1 * mfbcb *  vvx         +  mfacb                  * (c1o1 - vx2)              - c1o9 * oMdrho * vx2; 
-	  m2 =  mfccb * c1o2 +      mfbcb * (vvx + c1o2) + (mfacb + c1o9 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfacb = m0;
-	  mfbcb = m1;
-	  mfccb = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcac * c1o2 +      mfbac * (vvx - c1o2) + (mfaac + c1o36 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfcac        - c2o1 * mfbac *  vvx         +  mfaac                   * (c1o1 - vx2)              - c1o36 * oMdrho * vx2; 
-	  m2 =  mfcac * c1o2 +      mfbac * (vvx + c1o2) + (mfaac + c1o36 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfaac = m0;
-	  mfbac = m1;
-	  mfcac = m2;
-	  ///////////c////////////////////////////////////////////////////////////////////////
-	  m0 =  mfcbc * c1o2 +      mfbbc * (vvx - c1o2) + (mfabc + c1o9 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfcbc        - c2o1 * mfbbc *  vvx         +  mfabc                  * (c1o1 - vx2)              - c1o9 * oMdrho * vx2; 
-	  m2 =  mfcbc * c1o2 +      mfbbc * (vvx + c1o2) + (mfabc + c1o9 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfabc = m0;
-	  mfbbc = m1;
-	  mfcbc = m2;
-	  ///////////c////////////////////////////////////////////////////////////////////////
-	  m0 =  mfccc * c1o2 +      mfbcc * (vvx - c1o2) + (mfacc + c1o36 * oMdrho) * (     vx2 - vvx) * c1o2; 
-	  m1 = -mfccc        - c2o1 * mfbcc *  vvx         +  mfacc                   * (c1o1 - vx2)              - c1o36 * oMdrho * vx2; 
-	  m2 =  mfccc * c1o2 +      mfbcc * (vvx + c1o2) + (mfacc + c1o36 * oMdrho) * (     vx2 + vvx) * c1o2;
-	  mfacc = m0;
-	  mfbcc = m1;
-	  mfccc = m2;
-	  ////////////////////////////////////////////////////////////////////////////////////
-
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  //index 0
-	  kzero= posC[k];
-	  kw   = neighborCX[kzero];
-	  ks   = neighborCY[kzero];
-	  kb   = neighborCZ[kzero];
-	  ksw  = neighborCY[kw];
-	  kbw  = neighborCZ[kw];
-	  kbs  = neighborCZ[ks];
-	  kbsw = neighborCZ[ksw];
-	  ////////////////////////////////////////////////////////////////////////////////////
-
-
-	  ////////////////////////////////////////////////////////////////////////////////////
-	  feC[kzero]    = mfcbb;                                                                 
-	  fwC[kw]       = mfabb;                                                               
-	  fnC[kzero]    = mfbcb;
-	  fsC[ks]       = mfbab;
-	  ftC[kzero]    = mfbbc;
-	  fbC[kb]       = mfbba;
-	  fneC[kzero]   = mfccb;
-	  fswC[ksw]     = mfaab;
-	  fseC[ks]      = mfcab;
-	  fnwC[kw]      = mfacb;
-	  fteC[kzero]   = mfcbc;
-	  fbwC[kbw]     = mfaba;
-	  fbeC[kb]      = mfcba;
-	  ftwC[kw]      = mfabc;
-	  ftnC[kzero]   = mfbcc;
-	  fbsC[kbs]     = mfbaa;
-	  fbnC[kb]      = mfbca;
-	  ftsC[ks]      = mfbac;
-	  fzeroC[kzero] = mfbbb;
-	  ftneC[kzero]  = mfccc;
-	  ftseC[ks]     = mfcac;
-	  fbneC[kb]     = mfcca;
-	  fbseC[kbs]    = mfcaa;
-	  ftnwC[kw]     = mfacc;
-	  ftswC[ksw]    = mfaac;
-	  fbnwC[kbw]    = mfaca;
-	  fbswC[kbsw]   = mfaaa;
-	  ////////////////////////////////////////////////////////////////////////////////////
-   }
+   scaleFC_RhoSq_comp_27_Calculation(DC, DF, neighborCX, neighborCY, neighborCZ, neighborFX, neighborFY, neighborFZ,
+                                     size_MatC, size_MatF, evenOrOdd, posC, posFSWB, kFC, omCoarse, omFine, nu, nxC,
+                                     nyC, nxF, nyF, offFC, k);
 }
-//////////////////////////////////////////////////////////////////////////
 
 
 
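Note on the refactor above: several hundred lines of inlined fine-to-coarse interpolation collapse into one call. The definition of scaleFC_RhoSq_comp_27_Calculation lives outside this hunk; judging only from the call site, its declaration should look roughly like the sketch below. All parameter types are inferred from the surrounding kernels and are assumptions, not taken from this patch.

    // Hypothetical declaration, reconstructed from the call site above.
    __device__ void scaleFC_RhoSq_comp_27_Calculation(
        real* DC, real* DF,                                       // coarse / fine distributions
        unsigned int* neighborCX, unsigned int* neighborCY, unsigned int* neighborCZ,
        unsigned int* neighborFX, unsigned int* neighborFY, unsigned int* neighborFZ,
        unsigned int size_MatC, unsigned int size_MatF,
        bool evenOrOdd,
        unsigned int* posC, unsigned int* posFSWB, unsigned int kFC,
        real omCoarse, real omFine, real nu,
        unsigned int nxC, unsigned int nyC, unsigned int nxF, unsigned int nyF,
        OffFC offFC, const unsigned int k);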
diff --git a/src/gpu/VirtualFluids_GPU/GPU/TurbulenceIntensity.cu b/src/gpu/VirtualFluids_GPU/GPU/TurbulenceIntensity.cu
new file mode 100644
index 0000000000000000000000000000000000000000..42ac0cd4ffc6da19e67f88cbf430677dcfa8a826
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/GPU/TurbulenceIntensity.cu
@@ -0,0 +1,71 @@
+//  _    ___      __              __________      _     __        ______________   __
+// | |  / (_)____/ /___  ______ _/ / ____/ /_  __(_)___/ /____   /  ___/ __  / /  / /
+// | | / / / ___/ __/ / / / __ `/ / /_  / / / / / / __  / ___/  / /___/ /_/ / /  / /
+// | |/ / / /  / /_/ /_/ / /_/ / / __/ / / /_/ / / /_/ (__  )  / /_) / ____/ /__/ / 
+// |___/_/_/   \__/\__,_/\__,_/_/_/   /_/\__,_/_/\__,_/____/   \____/_/    \_____/
+//
+//////////////////////////////////////////////////////////////////////////
+
+/* Device code */
+#include "LBM/LB.h" 
+#include "LBM/D3Q27.h"
+#include <lbm/constants/NumericConstants.h>
+
+#include "lbm/MacroscopicQuantities.h"
+#include "../Kernel/Utilities/DistributionHelper.cuh"
+
+using namespace vf::lbm::constant;
+
+//////////////////////////////////////////////////////////////////////////////
+extern "C" __global__ void CalcTurbulenceIntensity(
+   real* vxx,
+   real* vyy,
+   real* vzz,
+   real* vxy,
+   real* vxz,
+   real* vyz,
+   real* vx_mean,
+   real* vy_mean,
+   real* vz_mean, 
+   real *distributions, 
+   uint* typeOfGridNode, 
+   unsigned int* neighborX,
+   unsigned int* neighborY,
+   unsigned int* neighborZ,
+   unsigned int size_Mat, 
+   bool isEvenTimestep)
+{
+   const unsigned k = vf::gpu::getNodeIndex();
+
+   if (k >= size_Mat)
+       return;
+
+   if (!vf::gpu::isValidFluidNode(typeOfGridNode[k]))
+       return;
+
+   vf::gpu::DistributionWrapper distr_wrapper(distributions, size_Mat, isEvenTimestep, k, neighborX, neighborY,
+                                              neighborZ);
+   const auto &distribution = distr_wrapper.distribution;
+
+   // analogous to LBCalcMacCompSP27
+   real rho   = vf::lbm::getDensity(distribution.f);
+   real vx    = vf::lbm::getCompressibleVelocityX1(distribution.f, rho);
+   real vy    = vf::lbm::getCompressibleVelocityX2(distribution.f, rho);
+   real vz    = vf::lbm::getCompressibleVelocityX3(distribution.f, rho);   
+
+
+   // compute subtotals:
+   // fluctuations
+   vxx[k] = vxx[k] + vx * vx;
+   vyy[k] = vyy[k] + vy * vy;
+   vzz[k] = vzz[k] + vz * vz;
+   vxy[k] = vxy[k] + vx * vy;
+   vxz[k] = vxz[k] + vx * vz;
+   vyz[k] = vyz[k] + vy * vz;
+
+   // velocity (for mean velocity)
+   vx_mean[k] = vx_mean[k] + vx;
+   vy_mean[k] = vy_mean[k] + vy;
+   vz_mean[k] = vz_mean[k] + vz; 
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
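The kernel above only accumulates per-node running sums; the turbulence intensity itself has to be formed afterwards, typically on the host. Below is a minimal sketch under the assumption that the sums were collected over nTimesteps samples (the cross terms vxy, vxz, vyz would feed the full Reynolds-stress tensor and are not needed for the scalar TI); all names are illustrative.

    #include <cmath>

    // Hypothetical host-side finalization for one node.
    double turbulenceIntensity(double sumVxx, double sumVyy, double sumVzz,
                               double sumVx, double sumVy, double sumVz,
                               unsigned int nTimesteps)
    {
        const double n = static_cast<double>(nTimesteps);
        const double vxMean = sumVx / n;
        const double vyMean = sumVy / n;
        const double vzMean = sumVz / n;
        // variance of each component: E[v*v] - E[v]^2
        const double vxxFluct = sumVxx / n - vxMean * vxMean;
        const double vyyFluct = sumVyy / n - vyMean * vyMean;
        const double vzzFluct = sumVzz / n - vzMean * vzMean;
        const double meanSpeed =
            std::sqrt(vxMean * vxMean + vyMean * vyMean + vzMean * vzMean);
        // TI = RMS of the fluctuations, normalized by the mean velocity magnitude
        return std::sqrt((vxxFluct + vyyFluct + vzzFluct) / 3.0) / meanSpeed;
    }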
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
index 4b54277ddd405eb619191895065af9bb3b780063..9f9f7539bc5a1e28612d956ca32234c5a3589f8a 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernel.h
@@ -13,6 +13,7 @@ class Kernel
 public:
     virtual ~Kernel()  = default;
     virtual void run() = 0;
+    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) = 0; // if stream == -1, run on the default stream
 
     virtual bool checkParameter()                                = 0;
     virtual std::vector<PreProcessorType> getPreProcessorTypes() = 0;
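The new pure virtual runOnIndices() lets callers run a kernel over an arbitrary subset of nodes, optionally on a chosen CUDA stream, which is what makes overlapping communication with computation possible. A usage sketch; the index arrays and counts are illustrative, not names from this patch:

    // Hypothetical caller: update border nodes on stream 0 while the bulk
    // runs concurrently on stream 1; stream = -1 selects the default stream.
    kernel->runOnIndices(borderIndices, numberOfBorderNodes, 0);
    kernel->runOnIndices(bulkIndices, numberOfBulkNodes, 1);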
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
index 5e4c5aa08e37e88008da13466bfeed6893ec94f6..3151e6bedeb6a96666f11f0040de2c95b20cc42c 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
+++ b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.cpp
@@ -2,8 +2,13 @@
 
 #include "Kernel/Utilities/CheckParameterStrategy/CheckParameterStrategy.h"
 
-bool KernelImp::checkParameter() 
-{ 
+
+void KernelImp::runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream)
+{
+    printf("Method not implemented for this Kernel \n");
+}
+
+bool KernelImp::checkParameter() {
     return checkStrategy->checkParameter(para);
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
index 47f689b7b3c88a6c7591454909cc6875384908c1..cba3540905df0314d6ce1eb6f0a1eab8d4a5a4c4 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
+++ b/src/gpu/VirtualFluids_GPU/Kernel/KernelImp.h
@@ -14,6 +14,7 @@ class KernelImp : public Kernel
 {
 public:
     virtual void run() = 0;
+    virtual void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1);
 
     bool checkParameter();
     std::vector<PreProcessorType> getPreProcessorTypes();
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.cu
index 86704c6143c2d48ee070dad58e12f49036dea43d..2b3b72a6888e62ccac1009d2f1ece14b96bf93be 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.cu
@@ -2,6 +2,7 @@
 
 #include "Parameter/Parameter.h"
 #include "CumulantK17Comp_Device.cuh"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantK17Comp> CumulantK17Comp::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,25 +11,7 @@ std::shared_ptr<CumulantK17Comp> CumulantK17Comp::getNewInstance(std::shared_ptr
 
 void CumulantK17Comp::run()
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->size_Mat_SP;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_CumulantK17Comp <<< grid, threads >>>(para->getParD(level)->omega,
+	LB_Kernel_CumulantK17Comp <<< cudaGrid.grid, cudaGrid.threads >>>(para->getParD(level)->omega,
 													para->getParD(level)->geoSP,
 													para->getParD(level)->neighborX_SP,
 													para->getParD(level)->neighborY_SP,
@@ -42,12 +25,9 @@ void CumulantK17Comp::run()
 	getLastCudaError("LB_Kernel_CumulantK17Comp execution failed");
 }
 
-CumulantK17Comp::CumulantK17Comp(std::shared_ptr<Parameter> para, int level)
+CumulantK17Comp::CumulantK17Comp(std::shared_ptr<Parameter> para, int level): KernelImp(para, level)
 {
-	this->para = para;
-	this->level = level;
-
 	myPreProcessorTypes.push_back(InitCompSP27);
-
 	myKernelGroup = BasicKernel;
+	this->cudaGrid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->size_Mat_SP);
 }
\ No newline at end of file
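The boilerplate removed from run() above survives in vf::cuda::CudaGrid. Assuming the class simply encapsulates the deleted computation (its real definition is in cuda/CudaGrid.h, not shown in this patch), it behaves like this sketch:

    // Sketch of the expected CudaGrid behavior, mirroring the deleted code.
    struct CudaGridSketch {
        dim3 grid;
        dim3 threads;
        CudaGridSketch(int numberOfThreads, int size_Mat)
        {
            int Grid = (size_Mat / numberOfThreads) + 1; // number of blocks needed
            int Grid1, Grid2;
            if (Grid > 512) {          // split into a 2D grid once one dimension
                Grid1 = 512;           // would get too large
                Grid2 = (Grid / Grid1) + 1;
            } else {
                Grid1 = 1;
                Grid2 = Grid;
            }
            grid    = dim3(Grid1, Grid2);
            threads = dim3(numberOfThreads, 1, 1);
        }
    };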
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim.cu
index b97b2778440b7d1ab32b3d0e9bb002a48df03134..09db3da401edbb4be2a3e3409dac9138c6fad4ad 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim.cu
@@ -2,6 +2,7 @@
 
 #include "Parameter/Parameter.h"
 #include "CumulantK17CompChim_Device.cuh"
+#include "cuda/CudaGrid.h"
 
 std::shared_ptr<CumulantK17CompChim> CumulantK17CompChim::getNewInstance(std::shared_ptr<Parameter> para, int level)
 {
@@ -10,25 +11,7 @@ std::shared_ptr<CumulantK17CompChim> CumulantK17CompChim::getNewInstance(std::sh
 
 void CumulantK17CompChim::run()
 {
-	int numberOfThreads = para->getParD(level)->numberofthreads;
-	int size_Mat = para->getParD(level)->size_Mat_SP;
-
-	int Grid = (size_Mat / numberOfThreads) + 1;
-	int Grid1, Grid2;
-	if (Grid>512)
-	{
-		Grid1 = 512;
-		Grid2 = (Grid / Grid1) + 1;
-	}
-	else
-	{
-		Grid1 = 1;
-		Grid2 = Grid;
-	}
-	dim3 grid(Grid1, Grid2);
-	dim3 threads(numberOfThreads, 1, 1);
-
-	LB_Kernel_CumulantK17CompChim <<< grid, threads >>>(
+	LB_Kernel_CumulantK17CompChim <<< cudaGrid.grid, cudaGrid.threads >>>(
 		para->getParD(level)->omega,
 		para->getParD(level)->geoSP,
 		para->getParD(level)->neighborX_SP,
@@ -47,12 +30,9 @@ void CumulantK17CompChim::run()
 	getLastCudaError("LB_Kernel_CumulantK17CompChim execution failed");
 }
 
-CumulantK17CompChim::CumulantK17CompChim(std::shared_ptr<Parameter> para, int level)
+CumulantK17CompChim::CumulantK17CompChim(std::shared_ptr<Parameter> para, int level): KernelImp(para, level)
 {
-	this->para = para;
-	this->level = level;
-
 	myPreProcessorTypes.push_back(InitCompSP27);
-
 	myKernelGroup = BasicKernel;
+	this->cudaGrid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->size_Mat_SP);
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu
new file mode 100644
index 0000000000000000000000000000000000000000..255452dbc016fdc732277e17f9736d3713db719a
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.cu
@@ -0,0 +1,60 @@
+#include "CumulantK17CompChimStream.h"
+
+#include "Parameter/Parameter.h"
+#include "Parameter/CudaStreamManager.h"
+#include "CumulantK17CompChimStream_Device.cuh"
+
+#include <cuda.h>
+
+std::shared_ptr<CumulantK17CompChimStream> CumulantK17CompChimStream::getNewInstance(std::shared_ptr<Parameter> para,
+                                                                               int level)
+{
+    return std::shared_ptr<CumulantK17CompChimStream>(new CumulantK17CompChimStream(para, level));
+}
+
+void CumulantK17CompChimStream::run()
+{
+	LB_Kernel_CumulantK17CompChimStream <<< cudaGrid.grid, cudaGrid.threads >>>(
+		para->getParD(level)->omega,
+		para->getParD(level)->neighborX_SP,
+		para->getParD(level)->neighborY_SP,
+		para->getParD(level)->neighborZ_SP,
+		para->getParD(level)->d0SP.f[0],
+		para->getParD(level)->size_Mat_SP,
+		level,
+		para->getForcesDev(),
+		para->getQuadricLimitersDev(),
+		para->getParD(level)->evenOrOdd,
+		para->getParD(level)->fluidNodeIndices,
+		para->getParD(level)->numberOfFluidNodes);
+	getLastCudaError("LB_Kernel_CumulantK17CompChimStream execution failed");
+}
+
+void CumulantK17CompChimStream::runOnIndices(const unsigned int *indices, unsigned int size_indices, int streamIndex)
+{
+    cudaStream_t stream = (streamIndex == -1) ? cudaStreamLegacy : para->getStreamManager()->getStream(streamIndex);
+
+    LB_Kernel_CumulantK17CompChimStream<<< cudaGrid.grid, cudaGrid.threads, 0, stream>>>(
+        para->getParD(level)->omega,
+        para->getParD(level)->neighborX_SP,
+        para->getParD(level)->neighborY_SP,
+        para->getParD(level)->neighborZ_SP,
+        para->getParD(level)->d0SP.f[0],
+        para->getParD(level)->size_Mat_SP,
+        level,
+        para->getForcesDev(),
+        para->getQuadricLimitersDev(),
+        para->getParD(level)->evenOrOdd,
+        indices,
+        size_indices);
+    getLastCudaError("LB_Kernel_CumulantK17CompChimStream execution failed");
+
+}
+
+CumulantK17CompChimStream::CumulantK17CompChimStream(std::shared_ptr<Parameter> para, int level): KernelImp(para, level)
+{
+	myPreProcessorTypes.push_back(InitCompSP27);
+	myKernelGroup = BasicKernel;
+	this->cudaGrid = vf::cuda::CudaGrid(para->getParD(level)->numberofthreads, para->getParD(level)->size_Mat_SP);
+}
+
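Note that runOnIndices() reuses the cudaGrid that the constructor sized for the full size_Mat_SP, so for small index subsets many blocks exit immediately via the kernel's numberOfFluidNodes guard. If that overhead ever matters, the launch could be sized per call; a sketch assuming the same CudaGrid constructor used above:

    // Sketch only: size the launch to the index subset instead of the full matrix.
    vf::cuda::CudaGrid partialGrid(para->getParD(level)->numberofthreads, size_indices);
    LB_Kernel_CumulantK17CompChimStream<<<partialGrid.grid, partialGrid.threads, 0, stream>>>(
        /* same arguments as above */);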
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h
new file mode 100644
index 0000000000000000000000000000000000000000..325826e04c893b7c56b7f00bb2503a4eb1fda441
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h
@@ -0,0 +1,18 @@
+#ifndef CUMULANT_K17_COMP_CHIM_STREAM_H
+#define CUMULANT_K17_COMP_CHIM_STREAM_H
+
+#include "Kernel/KernelImp.h"
+
+class CumulantK17CompChimStream : public KernelImp
+{
+public:
+    static std::shared_ptr<CumulantK17CompChimStream> getNewInstance(std::shared_ptr<Parameter> para, int level);
+    void run() override;
+    void runOnIndices(const unsigned int *indices, unsigned int size_indices, int stream = -1) override;
+
+private:
+    CumulantK17CompChimStream();
+    CumulantK17CompChimStream(std::shared_ptr<Parameter> para, int level);
+};
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStreamDevice.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStreamDevice.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f57fd9dd9bc2a372c7790bf8f3837e69d1d52beb
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStreamDevice.cu
@@ -0,0 +1,643 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CumulantK17CompChimStreamDevice.cu
+//! \ingroup GPU
+//! \author Martin Schoenherr, Anna Wellmann
+//=======================================================================================
+/* Device code */
+#include "LBM/LB.h" 
+#include "LBM/D3Q27.h"
+#include <lbm/constants/NumericConstants.h>
+
+using namespace vf::lbm::constant;
+#include "Kernel/ChimeraTransformation.h"
+
+////////////////////////////////////////////////////////////////////////////////
+extern "C" __global__ void LB_Kernel_CumulantK17CompChimStream(
+	real omega,
+	uint* neighborX,
+	uint* neighborY,
+	uint* neighborZ,
+	real* distributions,
+	unsigned long size_Mat,
+	int level,
+	real* forces,
+	real* quadricLimiters,
+	bool isEvenTimestep,
+    const uint *fluidNodeIndices, 
+    uint numberOfFluidNodes)
+{
+    //////////////////////////////////////////////////////////////////////////
+    //! Cumulant K17 Kernel is based on \ref
+    //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+    //! ]</b></a> and \ref <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017),
+    //! DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
+    //!
+    //! The cumulant kernel is executed in the following steps
+    //!
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+    //!
+    const unsigned x = threadIdx.x;
+    const unsigned y = blockIdx.x;
+    const unsigned z = blockIdx.y;
+
+    const unsigned nx = blockDim.x;
+    const unsigned ny = gridDim.x;
+
+    const unsigned k_thread = nx * (ny * z + y) + x;
+
+    //////////////////////////////////////////////////////////////////////////
+    // run for all indices in fluidNodeIndices
+    if (k_thread < numberOfFluidNodes) {
+        //////////////////////////////////////////////////////////////////////////
+        //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
+        //! timestep is based on the esoteric twist algorithm \ref <a
+        //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+        //! DOI:10.3390/computation5020019 ]</b></a>
+
+        const unsigned k = fluidNodeIndices[k_thread];
+
+        Distributions27 dist;
+        if (isEvenTimestep) {
+            dist.f[dirE]    = &distributions[dirE * size_Mat];
+            dist.f[dirW]    = &distributions[dirW * size_Mat];
+            dist.f[dirN]    = &distributions[dirN * size_Mat];
+            dist.f[dirS]    = &distributions[dirS * size_Mat];
+            dist.f[dirT]    = &distributions[dirT * size_Mat];
+            dist.f[dirB]    = &distributions[dirB * size_Mat];
+            dist.f[dirNE]   = &distributions[dirNE * size_Mat];
+            dist.f[dirSW]   = &distributions[dirSW * size_Mat];
+            dist.f[dirSE]   = &distributions[dirSE * size_Mat];
+            dist.f[dirNW]   = &distributions[dirNW * size_Mat];
+            dist.f[dirTE]   = &distributions[dirTE * size_Mat];
+            dist.f[dirBW]   = &distributions[dirBW * size_Mat];
+            dist.f[dirBE]   = &distributions[dirBE * size_Mat];
+            dist.f[dirTW]   = &distributions[dirTW * size_Mat];
+            dist.f[dirTN]   = &distributions[dirTN * size_Mat];
+            dist.f[dirBS]   = &distributions[dirBS * size_Mat];
+            dist.f[dirBN]   = &distributions[dirBN * size_Mat];
+            dist.f[dirTS]   = &distributions[dirTS * size_Mat];
+            dist.f[dirZERO] = &distributions[dirZERO * size_Mat];
+            dist.f[dirTNE]  = &distributions[dirTNE * size_Mat];
+            dist.f[dirTSW]  = &distributions[dirTSW * size_Mat];
+            dist.f[dirTSE]  = &distributions[dirTSE * size_Mat];
+            dist.f[dirTNW]  = &distributions[dirTNW * size_Mat];
+            dist.f[dirBNE]  = &distributions[dirBNE * size_Mat];
+            dist.f[dirBSW]  = &distributions[dirBSW * size_Mat];
+            dist.f[dirBSE]  = &distributions[dirBSE * size_Mat];
+            dist.f[dirBNW]  = &distributions[dirBNW * size_Mat];
+        } else {
+            dist.f[dirW]    = &distributions[dirE * size_Mat];
+            dist.f[dirE]    = &distributions[dirW * size_Mat];
+            dist.f[dirS]    = &distributions[dirN * size_Mat];
+            dist.f[dirN]    = &distributions[dirS * size_Mat];
+            dist.f[dirB]    = &distributions[dirT * size_Mat];
+            dist.f[dirT]    = &distributions[dirB * size_Mat];
+            dist.f[dirSW]   = &distributions[dirNE * size_Mat];
+            dist.f[dirNE]   = &distributions[dirSW * size_Mat];
+            dist.f[dirNW]   = &distributions[dirSE * size_Mat];
+            dist.f[dirSE]   = &distributions[dirNW * size_Mat];
+            dist.f[dirBW]   = &distributions[dirTE * size_Mat];
+            dist.f[dirTE]   = &distributions[dirBW * size_Mat];
+            dist.f[dirTW]   = &distributions[dirBE * size_Mat];
+            dist.f[dirBE]   = &distributions[dirTW * size_Mat];
+            dist.f[dirBS]   = &distributions[dirTN * size_Mat];
+            dist.f[dirTN]   = &distributions[dirBS * size_Mat];
+            dist.f[dirTS]   = &distributions[dirBN * size_Mat];
+            dist.f[dirBN]   = &distributions[dirTS * size_Mat];
+            dist.f[dirZERO] = &distributions[dirZERO * size_Mat];
+            dist.f[dirBSW]  = &distributions[dirTNE * size_Mat];
+            dist.f[dirBNE]  = &distributions[dirTSW * size_Mat];
+            dist.f[dirBNW]  = &distributions[dirTSE * size_Mat];
+            dist.f[dirBSE]  = &distributions[dirTNW * size_Mat];
+            dist.f[dirTSW]  = &distributions[dirBNE * size_Mat];
+            dist.f[dirTNE]  = &distributions[dirBSW * size_Mat];
+            dist.f[dirTNW]  = &distributions[dirBSE * size_Mat];
+            dist.f[dirTSE]  = &distributions[dirBNW * size_Mat];
+        }
+        ////////////////////////////////////////////////////////////////////////////////
+        //! - Set neighbor indices (necessary for indirect addressing)
+        uint kw   = neighborX[k];
+        uint ks   = neighborY[k];
+        uint kb   = neighborZ[k];
+        uint ksw  = neighborY[kw];
+        uint kbw  = neighborZ[kw];
+        uint kbs  = neighborZ[ks];
+        uint kbsw = neighborZ[ksw];
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Set local distributions
+        //!
+        real mfcbb = (dist.f[dirE])[k];
+        real mfabb = (dist.f[dirW])[kw];
+        real mfbcb = (dist.f[dirN])[k];
+        real mfbab = (dist.f[dirS])[ks];
+        real mfbbc = (dist.f[dirT])[k];
+        real mfbba = (dist.f[dirB])[kb];
+        real mfccb = (dist.f[dirNE])[k];
+        real mfaab = (dist.f[dirSW])[ksw];
+        real mfcab = (dist.f[dirSE])[ks];
+        real mfacb = (dist.f[dirNW])[kw];
+        real mfcbc = (dist.f[dirTE])[k];
+        real mfaba = (dist.f[dirBW])[kbw];
+        real mfcba = (dist.f[dirBE])[kb];
+        real mfabc = (dist.f[dirTW])[kw];
+        real mfbcc = (dist.f[dirTN])[k];
+        real mfbaa = (dist.f[dirBS])[kbs];
+        real mfbca = (dist.f[dirBN])[kb];
+        real mfbac = (dist.f[dirTS])[ks];
+        real mfbbb = (dist.f[dirZERO])[k];
+        real mfccc = (dist.f[dirTNE])[k];
+        real mfaac = (dist.f[dirTSW])[ksw];
+        real mfcac = (dist.f[dirTSE])[ks];
+        real mfacc = (dist.f[dirTNW])[kw];
+        real mfcca = (dist.f[dirBNE])[kb];
+        real mfaaa = (dist.f[dirBSW])[kbsw];
+        real mfcaa = (dist.f[dirBSE])[kbs];
+        real mfaca = (dist.f[dirBNW])[kbw];
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+        //!
+        real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
+                     (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) +
+                      ((mfacb + mfcab) + (mfaab + mfccb))) +
+                     ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) +
+                    mfbbb;
+
+        real rho   = c1o1 + drho;
+        real OOrho = c1o1 / rho;
+
+        real vvx = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
+                    (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) + (mfcbb - mfabb)) *
+                   OOrho;
+        real vvy = ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
+                    (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) + (mfbcb - mfbab)) *
+                   OOrho;
+        real vvz = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
+                    (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) + (mfbbc - mfbba)) *
+                   OOrho;
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+        //!
+        real factor = c1o1;
+        for (int i = 1; i <= level; i++) {
+            factor *= c2o1;
+        }
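+        // factor = 2^level: the body force is given on the coarsest grid and is
+        // rescaled here to the smaller time step of the current refinement level.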
+        real fx = forces[0] / factor;
+        real fy = forces[1] / factor;
+        real fz = forces[2] / factor;
+        vvx += fx * c1o2;
+        vvy += fy * c1o2;
+        vvz += fz * c1o2;
+        ////////////////////////////////////////////////////////////////////////////////////
+        // calculate the square of velocities for this lattice node
+        real vx2 = vvx * vvx;
+        real vy2 = vvy * vvy;
+        real vz2 = vvz * vvz;
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to
+        //! section 6 in \ref <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+        //!
+        real wadjust;
+        real qudricLimitP = quadricLimiters[0];
+        real qudricLimitM = quadricLimiters[1];
+        real qudricLimitD = quadricLimiters[2];
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in \ref
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (6)-(14) in \ref <a
+        //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+        //! ]</b></a>
+        //!
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Z - Dir
+        forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
+        forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9o1, c1o9);
+        forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
+        forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9o1, c1o9);
+        forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
+        forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9o1, c1o9);
+        forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
+        forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9o1, c1o9);
+        forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
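+        // The trailing constant pairs (36, 1/36), (9, 1/9), (9/4, 4/9) mirror the
+        // inverse lattice weights and keep the central moments well conditioned.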
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Y - Dir
+        forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6o1, c1o6);
+        forwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
+        forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
+        forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
+        forwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
+        forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
+        forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6o1, c1o6);
+        forwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
+        forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        // X - Dir
+        forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
+        forwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
+        forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
+        forwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
+        forwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
+        forwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
+        forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
+        forwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
+        forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9o1, c1o9);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations
+        //! according to <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+        //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
+        //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk
+        //!  viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
+        //!  - Third order cumulants \f$ C_{120}+C_{102}, C_{210}+C_{012}, C_{201}+C_{021} \f$: \f$ \omega_3=OxyyPxzz
+        //!  \f$ set according to Eq. (111) with simplifications assuming \f$ \omega_2=1.0\f$.
+        //!  - Third order cumulants \f$ C_{120}-C_{102}, C_{210}-C_{012}, C_{201}-C_{021} \f$: \f$ \omega_4 = OxyyMxzz
+        //!  \f$ set according to Eq. (112) with simplifications assuming \f$ \omega_2 = 1.0\f$.
+        //!  - Third order cumulants \f$ C_{111} \f$: \f$ \omega_5 = Oxyz \f$ set according to Eq. (113) with
+        //!  simplifications assuming \f$ \omega_2 = 1.0\f$  (modify for different bulk viscosity).
+        //!  - Fourth order cumulants \f$ C_{220}, C_{202}, C_{022}, C_{211}, C_{121}, C_{112} \f$: for simplification
+        //!  all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
+        //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
+        //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
+        //!
+        ////////////////////////////////////////////////////////////
+        // 2.
+        real OxxPyyPzz = c1o1;
+        ////////////////////////////////////////////////////////////
+        // 3.
+        real OxyyPxzz = c8o1 * (-c2o1 + omega) * (c1o1 + c2o1 * omega) / (-c8o1 - c14o1 * omega + c7o1 * omega * omega);
+        real OxyyMxzz =
+            c8o1 * (-c2o1 + omega) * (-c7o1 + c4o1 * omega) / (c56o1 - c50o1 * omega + c9o1 * omega * omega);
+        real Oxyz = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1 * omega + c3o1 * omega * omega) /
+                    (c48o1 + c152o1 * omega - c130o1 * omega * omega + c29o1 * omega * omega * omega);
+        ////////////////////////////////////////////////////////////
+        // 4.
+        real O4 = c1o1;
+        ////////////////////////////////////////////////////////////
+        // 5.
+        real O5 = c1o1;
+        ////////////////////////////////////////////////////////////
+        // 6.
+        real O6 = c1o1;
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (114) and (115)
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a> with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for
+        //! different bulk viscosity).
+        //!
+        real A = (c4o1 + c2o1 * omega - c3o1 * omega * omega) / (c2o1 - c7o1 * omega + c5o1 * omega * omega);
+        real B = (c4o1 + c28o1 * omega - c14o1 * omega * omega) / (c6o1 - c21o1 * omega + c15o1 * omega * omega);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Compute cumulants from central moments according to Eq. (20)-(23) in
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+        //!
+        ////////////////////////////////////////////////////////////
+        // 4.
+        real CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) * OOrho;
+        real CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) * OOrho;
+        real CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) * OOrho;
+
+        real CUMcca =
+            mfcca - (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9 * (drho * OOrho));
+        real CUMcac =
+            mfcac - (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9 * (drho * OOrho));
+        real CUMacc =
+            mfacc - (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9 * (drho * OOrho));
+        ////////////////////////////////////////////////////////////
+        // 5.
+        real CUMbcc =
+            mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba * mfabc)) +
+                     c1o3 * (mfbca + mfbac)) *
+                        OOrho;
+        real CUMcbc =
+            mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba * mfbac)) +
+                     c1o3 * (mfcba + mfabc)) *
+                        OOrho;
+        real CUMccb =
+            mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb * mfcba)) +
+                     c1o3 * (mfacb + mfcab)) *
+                        OOrho;
+        ////////////////////////////////////////////////////////////
+        // 6.
+        real CUMccc = mfccc + ((-c4o1 * mfbbb * mfbbb - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca) -
+                                c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc) -
+                                c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) *
+                                   OOrho +
+                               (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac) +
+                                c2o1 * (mfcaa * mfaca * mfaac) + c16o1 * mfbba * mfbab * mfabb) *
+                                   OOrho * OOrho -
+                               c1o3 * (mfacc + mfcac + mfcca) * OOrho - c1o9 * (mfcaa + mfaca + mfaac) * OOrho +
+                               (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba) +
+                                (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) *
+                                   OOrho * OOrho * c2o3 +
+                               c1o27 * ((drho * drho - drho) * OOrho * OOrho));
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Compute linear combinations of second and third order cumulants
+        //!
+        ////////////////////////////////////////////////////////////
+        // 2.
+        real mxxPyyPzz = mfcaa + mfaca + mfaac;
+        real mxxMyy    = mfcaa - mfaca;
+        real mxxMzz    = mfcaa - mfaac;
+        ////////////////////////////////////////////////////////////
+        // 3.
+        real mxxyPyzz = mfcba + mfabc;
+        real mxxyMyzz = mfcba - mfabc;
+
+        real mxxzPyyz = mfcab + mfacb;
+        real mxxzMyyz = mfcab - mfacb;
+
+        real mxyyPxzz = mfbca + mfbac;
+        real mxyyMxzz = mfbca - mfbac;
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        // incl. correction
+        ////////////////////////////////////////////////////////////
+        //! - Compute velocity gradients from second order cumulants according to Eq. (27)-(32) in
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>. The correction of the viscosity is explained in Appendix H of
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>. Note that the division by rho is omitted here as we need rho
+        //! times the gradients later.
+        //!
+        real Dxy  = -c3o1 * omega * mfbba;
+        real Dxz  = -c3o1 * omega * mfbab;
+        real Dyz  = -c3o1 * omega * mfabb;
+        real dxux = c1o2 * (-omega) * (mxxMyy + mxxMzz) + c1o2 * OxxPyyPzz * (mfaaa - mxxPyyPzz);
+        real dyuy = dxux + omega * c3o2 * mxxMyy;
+        real dzuz = dxux + omega * c3o2 * mxxMzz;
+        ////////////////////////////////////////////////////////////
+        //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+        //!
+        mxxPyyPzz +=
+            OxxPyyPzz * (mfaaa - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2 * dzuz);
+        mxxMyy += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
+        mxxMzz += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        ////no correction
+        // mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
+        // mxxMyy += -(-omega) * (-mxxMyy);
+        // mxxMzz += -(-omega) * (-mxxMzz);
+        //////////////////////////////////////////////////////////////////////////
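+        //! - Relaxation of the off-diagonal second order cumulants \f$ C_{110}, C_{101}, C_{011} \f$ (identical to
+        //! the corresponding central moments) with the relaxation rate \f$ \omega \f$.
+        //!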
+        mfabb += omega * (-mfabb);
+        mfbab += omega * (-mfbab);
+        mfbba += omega * (-mfbba);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        // relax
+        //////////////////////////////////////////////////////////////////////////
+        // incl. limiter
+        //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+        //!
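+        // note: wadjust = O + (1 - O) * |m| / (|m| + limiter) returns the nominal rate O for
+        // |m| << limiter and tends to 1 for |m| >> limiter, i.e. large third order cumulants are
+        // relaxed directly to zero.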
+        wadjust = Oxyz + (c1o1 - Oxyz) * abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
+        mfbbb += wadjust * (-mfbbb);
+        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
+        mxxyPyzz += wadjust * (-mxxyPyzz);
+        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
+        mxxyMyzz += wadjust * (-mxxyMyzz);
+        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
+        mxxzPyyz += wadjust * (-mxxzPyyz);
+        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
+        mxxzMyyz += wadjust * (-mxxzMyyz);
+        wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
+        mxyyPxzz += wadjust * (-mxyyPxzz);
+        wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
+        mxyyMxzz += wadjust * (-mxyyMxzz);
+        //////////////////////////////////////////////////////////////////////////
+        // no limiter
+        // mfbbb += OxyyMxzz * (-mfbbb);
+        // mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
+        // mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
+        // mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
+        // mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
+        // mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
+        // mxyyMxzz += OxyyMxzz * (-mxyyMxzz);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Compute inverse linear combinations of second and third order cumulants
+        //!
+        mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
+        mfaca = c1o3 * (-c2o1 * mxxMyy + mxxMzz + mxxPyyPzz);
+        mfaac = c1o3 * (mxxMyy - c2o1 * mxxMzz + mxxPyyPzz);
+
+        mfcba = (mxxyMyzz + mxxyPyzz) * c1o2;
+        mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
+        mfcab = (mxxzMyyz + mxxzPyyz) * c1o2;
+        mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
+        mfbca = (mxyyMxzz + mxyyPxzz) * c1o2;
+        mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
+        //////////////////////////////////////////////////////////////////////////
+
+        //////////////////////////////////////////////////////////////////////////
+        // 4.
+        // no limiter
+        //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according
+        //! to Eq. (43)-(48) <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+        //!
+        CUMacc = -O4 * (c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMacc);
+        CUMcac = -O4 * (c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMcac);
+        CUMcca = -O4 * (c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * A + (c1o1 - O4) * (CUMcca);
+        CUMbbc = -O4 * (c1o1 / omega - c1o2) * Dxy * c1o3 * B + (c1o1 - O4) * (CUMbbc);
+        CUMbcb = -O4 * (c1o1 / omega - c1o2) * Dxz * c1o3 * B + (c1o1 - O4) * (CUMbcb);
+        CUMcbb = -O4 * (c1o1 / omega - c1o2) * Dyz * c1o3 * B + (c1o1 - O4) * (CUMcbb);
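+        // with the default O4 = c1o1 the (c1o1 - O4) terms vanish and the fourth order cumulants
+        // are set directly to their modified equilibria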
+
+        //////////////////////////////////////////////////////////////////////////
+        // 5.
+        CUMbcc += O5 * (-CUMbcc);
+        CUMcbc += O5 * (-CUMcbc);
+        CUMccb += O5 * (-CUMccb);
+
+        //////////////////////////////////////////////////////////////////////////
+        // 6.
+        CUMccc += O6 * (-CUMccc);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
+        //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+        //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+        //!
+
+        //////////////////////////////////////////////////////////////////////////
+        // 4.
+        mfcbb = CUMcbb + c1o3 * ((c3o1 * mfcaa + c1o1) * mfabb + c6o1 * mfbba * mfbab) * OOrho;
+        mfbcb = CUMbcb + c1o3 * ((c3o1 * mfaca + c1o1) * mfbab + c6o1 * mfbba * mfabb) * OOrho;
+        mfbbc = CUMbbc + c1o3 * ((c3o1 * mfaac + c1o1) * mfbba + c6o1 * mfbab * mfabb) * OOrho;
+
+        mfcca =
+            CUMcca +
+            (((mfcaa * mfaca + c2o1 * mfbba * mfbba) * c9o1 + c3o1 * (mfcaa + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
+        mfcac =
+            CUMcac +
+            (((mfcaa * mfaac + c2o1 * mfbab * mfbab) * c9o1 + c3o1 * (mfcaa + mfaac)) * OOrho - (drho * OOrho)) * c1o9;
+        mfacc =
+            CUMacc +
+            (((mfaac * mfaca + c2o1 * mfabb * mfabb) * c9o1 + c3o1 * (mfaac + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
+
+        //////////////////////////////////////////////////////////////////////////
+        // 5.
+        mfbcc = CUMbcc + c1o3 *
+                             (c3o1 * (mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb +
+                                      c2o1 * (mfbab * mfacb + mfbba * mfabc)) +
+                              (mfbca + mfbac)) *
+                             OOrho;
+        mfcbc = CUMcbc + c1o3 *
+                             (c3o1 * (mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb +
+                                      c2o1 * (mfabb * mfcab + mfbba * mfbac)) +
+                              (mfcba + mfabc)) *
+                             OOrho;
+        mfccb = CUMccb + c1o3 *
+                             (c3o1 * (mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb +
+                                      c2o1 * (mfbab * mfbca + mfabb * mfcba)) +
+                              (mfacb + mfcab)) *
+                             OOrho;
+
+        //////////////////////////////////////////////////////////////////////////
+        // 6.
+        mfccc = CUMccc - ((-c4o1 * mfbbb * mfbbb - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca) -
+                           c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc) -
+                           c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) *
+                              OOrho +
+                          (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac) +
+                           c2o1 * (mfcaa * mfaca * mfaac) + c16o1 * mfbba * mfbab * mfabb) *
+                              OOrho * OOrho -
+                          c1o3 * (mfacc + mfcac + mfcca) * OOrho - c1o9 * (mfcaa + mfaca + mfaac) * OOrho +
+                          (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba) +
+                           (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) *
+                              OOrho * OOrho * c2o3 +
+                          c1o27 * ((drho * drho - drho) * OOrho * OOrho));
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+        //!
+        mfbaa = -mfbaa;
+        mfaba = -mfaba;
+        mfaab = -mfaab;
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
+        //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+        //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (88)-(96) in <a
+        //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+        //! ]</b></a>
+        //!
+        ////////////////////////////////////////////////////////////////////////////////////
+        // X - Dir
+        backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
+        backwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
+        backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
+        backwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
+        backwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
+        backwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
+        backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
+        backwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
+        backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9o1, c1o9);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Y - Dir
+        backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6o1, c1o6);
+        backwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
+        backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
+        backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
+        backwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
+        backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
+        backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6o1, c1o6);
+        backwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
+        backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        // Z - Dir
+        backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
+        backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9o1, c1o9);
+        backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
+        backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9o1, c1o9);
+        backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
+        backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9o1, c1o9);
+        backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
+        backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9o1, c1o9);
+        backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
+
+        ////////////////////////////////////////////////////////////////////////////////////
+        //! - Write distributions: the pattern for reading and writing the distributions from/to the
+        //! stored arrays depends on the timestep and is based on the esoteric twist algorithm
+        //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+        //! DOI:10.3390/computation5020019 ]</b></a>
+        //!
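+        // note: the assignment of read/write addresses in dist.f depends on isEvenTimestep, so the
+        // same kernel serves both phases of the esoteric twist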
+        (dist.f[dirE])[k]      = mfabb;
+        (dist.f[dirW])[kw]     = mfcbb;
+        (dist.f[dirN])[k]      = mfbab;
+        (dist.f[dirS])[ks]     = mfbcb;
+        (dist.f[dirT])[k]      = mfbba;
+        (dist.f[dirB])[kb]     = mfbbc;
+        (dist.f[dirNE])[k]     = mfaab;
+        (dist.f[dirSW])[ksw]   = mfccb;
+        (dist.f[dirSE])[ks]    = mfacb;
+        (dist.f[dirNW])[kw]    = mfcab;
+        (dist.f[dirTE])[k]     = mfaba;
+        (dist.f[dirBW])[kbw]   = mfcbc;
+        (dist.f[dirBE])[kb]    = mfabc;
+        (dist.f[dirTW])[kw]    = mfcba;
+        (dist.f[dirTN])[k]     = mfbaa;
+        (dist.f[dirBS])[kbs]   = mfbcc;
+        (dist.f[dirBN])[kb]    = mfbac;
+        (dist.f[dirTS])[ks]    = mfbca;
+        (dist.f[dirZERO])[k]   = mfbbb;
+        (dist.f[dirTNE])[k]    = mfaaa;
+        (dist.f[dirTSE])[ks]   = mfaca;
+        (dist.f[dirBNE])[kb]   = mfaac;
+        (dist.f[dirBSE])[kbs]  = mfacc;
+        (dist.f[dirTNW])[kw]   = mfcaa;
+        (dist.f[dirTSW])[ksw]  = mfcca;
+        (dist.f[dirBNW])[kbw]  = mfcac;
+        (dist.f[dirBSW])[kbsw] = mfccc;
+    }
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream_Device.cuh b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream_Device.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..035e438ccf8df70c5df43c70f0b4a5ffe160acc8
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream_Device.cuh
@@ -0,0 +1,23 @@
+#ifndef LB_Kernel_CUMULANT_K17_COMP_CHIM_STREAM_H
+#define LB_Kernel_CUMULANT_K17_COMP_CHIM_STREAM_H
+
+#include <DataTypes.h>
+#include <curand.h>
+
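+// Stream variant of the CumulantK17CompChim kernel: it operates on an explicit list of fluid
+// node indices (fluidNodeIndices), so that subsets of the nodes can be processed in separate
+// CUDA streams, e.g. to hide multi-GPU communication behind the bulk collision.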
+extern "C" __global__ void LB_Kernel_CumulantK17CompChimStream(
+	real omega,
+	uint* neighborX,
+	uint* neighborY,
+	uint* neighborZ,
+	real* distributions,
+	unsigned long size_Mat,
+	int level,
+	real* forces,
+	real* quadricLimiters,
+	bool isEvenTimestep,
+	const uint* fluidNodeIndices,
+	uint numberOfFluidNodes);
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.cpp b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.cpp
index ffc3491752c34b5974779fe8c927300bf21a1bb9..6e9688f5724e347df53397eb3880b3758c236730 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.cpp
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.cpp
@@ -11,6 +11,7 @@
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17Comp.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Unified/CumulantK17Unified.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chim/CumulantK17CompChim.h"
+#include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17chimStream/CumulantK17CompChimStream.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17Bulk/CumulantK17BulkComp.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantAll4/CumulantAll4CompSP27.h"
 #include "Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK18/CumulantK18Comp.h"
@@ -139,6 +140,9 @@ std::shared_ptr<Kernel> KernelFactoryImp::makeKernel(std::shared_ptr<Parameter>
     } else if (kernel == "CumulantK17CompChim") {
         newKernel     = CumulantK17CompChim::getNewInstance(para, level);
         checkStrategy = FluidFlowCompStrategy::getInstance();
+    } else if (kernel == "CumulantK17CompChimStream") {
+        newKernel     = CumulantK17CompChimStream::getNewInstance(para, level);
+        checkStrategy = FluidFlowCompStrategy::getInstance();
     } else if (kernel == "CumulantAll4CompSP27") {
         newKernel     = CumulantAll4CompSP27::getNewInstance(para, level);
         checkStrategy = FluidFlowCompStrategy::getInstance();
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index a16566ecb734f8c8c5d1e7d5f01df884e6464f6a..a055542983381b9f843f18e9c5f069729c329c79 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -9,12 +9,14 @@
 #include "Communication/Communicator.h"
 #include "Communication/ExchangeData27.h"
 #include "Parameter/Parameter.h"
+#include "Parameter/CudaStreamManager.h"
 #include "GPU/GPU_Interface.h"
 #include "basics/utilities/UbFileOutputASCII.h"
 //////////////////////////////////////////////////////////////////////////
 #include "Output/MeasurePointWriter.hpp"
 #include "Output/AnalysisData.hpp"
 #include "Output/InterfaceDebugWriter.hpp"
+#include "Output/EdgeNodeDebugWriter.hpp"
 #include "Output/VeloASCIIWriter.hpp"
 //////////////////////////////////////////////////////////////////////////
 #include "Utilities/Buffer2D.hpp"
@@ -34,6 +36,7 @@
 #include "Calculation/Cp.h"
 #include "Calculation/Calc2ndMoments.h"
 #include "Calculation/CalcMedian.h"
+#include "Calculation/CalcTurbulenceIntensity.h"
 #include "Calculation/ForceCalculations.h"
 #include "Calculation/PorousMedia.h"
 //////////////////////////////////////////////////////////////////////////
@@ -94,6 +97,11 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
 
    gridProvider->allocAndCopyForcing();
    gridProvider->allocAndCopyQuadricLimiters();
+   if (para->getKernelNeedsFluidNodeIndicesToRun()) {
+       gridProvider->allocArrays_fluidNodeIndices();
+       gridProvider->allocArrays_fluidNodeIndicesBorder();
+   }
+
    gridProvider->setDimensions();
    gridProvider->setBoundingBox();
 
@@ -110,6 +118,14 @@
    if(para->getMyID() == 0) output.setConsoleOut(true);
    output.clearLogFile();
    //////////////////////////////////////////////////////////////////////////
+   // CUDA streams
+   if (para->getUseStreams()) {
+       para->getStreamManager()->launchStreams(2u);
+       para->getStreamManager()->createCudaEvents();
+   }
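+   // two streams allow overlapping communication with bulk work (communication hiding)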
+   //////////////////////////////////////////////////////////////////////////
+   // 
    //output << para->getNeedInterface().at(0) << "\n";
    //output << para->getNeedInterface().at(1) << "\n";
    //output << para->getNeedInterface().at(2) << "\n";
@@ -138,10 +153,7 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    /////////////////////////////////////////////////////////////////////////
    cudaManager->setMemsizeGPU(0, true);
    //////////////////////////////////////////////////////////////////////////
-   gridProvider->allocArrays_CoordNeighborGeo();
-   gridProvider->allocArrays_BoundaryValues();
-   gridProvider->allocArrays_BoundaryQs();
-   gridProvider->allocArrays_OffsetScale();
+   allocNeighborsOffsetsScalesAndBoundaries(gridProvider);
 
 	for( SPtr<PreCollisionInteractor> actuator: para->getActuators()){
 		actuator->init(para.get(), gridProvider.get(), cudaManager.get());
@@ -213,12 +225,20 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    //////////////////////////////////////////////////////////////////////////
    if (para->getCalcMedian())
    {
-       output << "alloc Calculation for Mean Valus  " << "\n";
+       output << "alloc Calculation for Mean Values  " << "\n";
 	   if (para->getDiffOn())	allocMedianAD(para.get(), cudaManager.get());
 	   else						allocMedian(para.get(), cudaManager.get());
    }
 
 
+   //////////////////////////////////////////////////////////////////////////
+   // Turbulence Intensity
+   //////////////////////////////////////////////////////////////////////////
+   if (para->getCalcTurbulenceIntensity()) {
+       output << "alloc arrays for calculating Turbulence Intensity  " << "\n";
+       allocTurbulenceIntensity(para.get(), cudaManager.get());
+   }
+
    //////////////////////////////////////////////////////////////////////////
    //allocate memory and initialize 2nd, 3rd and higher order moments
    //////////////////////////////////////////////////////////////////////////
@@ -310,7 +330,14 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    //findPressQShip(para);
    //output << "done.\n";
 
-
+   //////////////////////////////////////////////////////////////////////////
+   // find indices of edge nodes for multiGPU communication
+   //////////////////////////////////////////////////////////////////////////
+   if (para->getDevices().size() > 2) {
+       output << "Find indices of edge nodes for multiGPU communication ...";
+       para->findEdgeNodesCommMultiGPU();
+       output << "done.\n";
+   }
    //////////////////////////////////////////////////////////////////////////
    //Memory alloc for CheckPoint / Restart
    //////////////////////////////////////////////////////////////////////////
@@ -362,6 +389,11 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
 	   output << "done.\n";
    }
 
+   //////////////////////////////////////////////////////////////////////////
+   // Init UpdateGrid
+   //////////////////////////////////////////////////////////////////////////
+   this->updateGrid27 = std::make_unique<UpdateGrid27>(para, communicator, cudaManager, pm, kernels);
+
    //////////////////////////////////////////////////////////////////////////
    //Print Init
    //////////////////////////////////////////////////////////////////////////
@@ -373,10 +405,29 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
 
    //////////////////////////////////////////////////////////////////////////
    output << "used Device Memory: " << cudaManager->getMemsizeGPU() / 1000000.0 << " MB\n";
+   // std::cout << "Process " << communicator.getPID() <<": used device memory" << cudaManager->getMemsizeGPU() / 1000000.0 << " MB\n" << std::endl;
    //////////////////////////////////////////////////////////////////////////
 
    //InterfaceDebugWriter::writeInterfaceLinesDebugCF(para.get());
    //InterfaceDebugWriter::writeInterfaceLinesDebugFC(para.get());
+
+   // writers for version with communication hiding
+   //    if(para->getNumprocs() > 1 && para->getUseStreams()){
+   //        InterfaceDebugWriter::writeInterfaceFCC_Send(para.get());
+   //        InterfaceDebugWriter::writeInterfaceCFC_Recv(para.get());
+   //        InterfaceDebugWriter::writeSendNodesStream(para.get());
+   //        InterfaceDebugWriter::writeRecvNodesStream(para.get());
+   //        EdgeNodeDebugWriter::writeEdgeNodesXZ_Send(para);
+   //        EdgeNodeDebugWriter::writeEdgeNodesXZ_Recv(para);
+   //    }
+}
+
+void Simulation::allocNeighborsOffsetsScalesAndBoundaries(SPtr<GridProvider> &gridProvider)
+{
+    gridProvider->allocArrays_CoordNeighborGeo();
+    gridProvider->allocArrays_OffsetScale();
+    gridProvider->allocArrays_BoundaryValues(); // allocArrays_BoundaryValues() has to be called after allocArrays_OffsetScale() because of initCommunicationArraysForCommAfterFinetoCoarse() 
+    gridProvider->allocArrays_BoundaryQs();
 }
 
 void Simulation::bulk()
@@ -387,6 +438,7 @@ void Simulation::bulk()
 void Simulation::run()
 {
    unsigned int t, t_prev;
+   uint t_turbulenceIntensity = 0;
    unsigned int t_MP = 0;
 
    //////////////////////////////////////////////////////////////////////////
@@ -418,9 +470,8 @@ void Simulation::run()
 	////////////////////////////////////////////////////////////////////////////////
 	for(t=para->getTStart();t<=para->getTEnd();t++)
 	{
-		
-        updateGrid27(para.get(), communicator, cudaManager.get(), pm, 0, t, kernels);
-		
+        this->updateGrid27->updateGrid(0, t);
+
 	    ////////////////////////////////////////////////////////////////////////////////
 	    //Particles
 	    ////////////////////////////////////////////////////////////////////////////////
@@ -434,7 +485,10 @@ void Simulation::run()
         // run Analyzers for kinetic energy and enstrophy for TGV in 3D
         // these analyzers only work on level 0
 	    ////////////////////////////////////////////////////////////////////////////////
-        if( this->kineticEnergyAnalyzer || this->enstrophyAnalyzer ) exchangeMultiGPU(para.get(), communicator, cudaManager.get(), 0);
+        if (this->kineticEnergyAnalyzer || this->enstrophyAnalyzer) {
+            prepareExchangeMultiGPU(para.get(), 0, -1);
+            exchangeMultiGPU(para.get(), communicator, cudaManager.get(), 0, -1);
+        }
 
 	    if( this->kineticEnergyAnalyzer ) this->kineticEnergyAnalyzer->run(t);
 	    if( this->enstrophyAnalyzer     ) this->enstrophyAnalyzer->run(t);
@@ -482,6 +536,32 @@
         
           }
         }
+
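+		// accumulate second moments (vxx, ...) and mean velocities every timestep; they are
+		// normalized over t_diff timesteps in calcTurbulenceIntensity() when output is written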
+		if (para->getCalcTurbulenceIntensity()) {
+            for (int lev = para->getCoarse(); lev <= para->getFine(); lev++) {
+				CalcTurbulenceIntensityDevice(
+				    para->getParD(lev)->vxx,
+				    para->getParD(lev)->vyy,
+				    para->getParD(lev)->vzz,
+				    para->getParD(lev)->vxy,
+				    para->getParD(lev)->vxz,
+				    para->getParD(lev)->vyz,
+				    para->getParD(lev)->vx_mean,
+				    para->getParD(lev)->vy_mean,
+				    para->getParD(lev)->vz_mean,
+				    para->getParD(lev)->d0SP.f[0], 
+				    para->getParD(lev)->geoSP,
+				    para->getParD(lev)->neighborX_SP,
+				    para->getParD(lev)->neighborY_SP, 
+				    para->getParD(lev)->neighborZ_SP,
+				    para->getParD(lev)->size_Mat_SP,
+				    para->getParD(lev)->evenOrOdd,
+				    para->getParD(lev)->numberofthreads
+				);
+			}
+		}
         ////////////////////////////////////////////////////////////////////////////////
 
 
@@ -497,19 +575,19 @@ void Simulation::run()
             
             if( para->getDoCheckPoint() )
             {
-                output << "Dateien fuer CheckPoint kopieren t=" << t << "...\n";
+                output << "Copy data for CheckPoint t=" << t << "...\n";
                 
                 for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
                 {
                     cudaManager->cudaCopyFsForCheckPoint(lev);
                 }
                 
-                output << "Dateien fuer CheckPoint schreiben t=" << t << "...";
+                output << "Write data for CheckPoint t=" << t << "...";
 
 				const auto name = getFileName(para->getFName(), t, para->getMyID());
 				restart_object->serialize(name, para);
 
-                output << "\n fertig\n";
+                output << "\n done\n";
             }
             //////////////////////////////////////////////////////////////////////////
 			averageTimer->startTimer();
@@ -628,17 +706,18 @@ void Simulation::run()
 
 		//////////////////////////////////////////////////////////////////////////
 		averageTimer->stopTimer();
-		averageTimer->outputPerformance(t, para.get());
+		averageTimer->outputPerformance(t, para.get(), communicator);
 		//////////////////////////////////////////////////////////////////////////
 
          if( para->getPrintFiles() )
          {
-            output << "Dateien schreiben t=" << t << "...";
+            output << "Write files t=" << t << "... ";
             for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
             {
 		        //////////////////////////////////////////////////////////////////////////
 		        //exchange data for valid post process
-		        exchangeMultiGPU(para.get(), communicator, cudaManager.get(), lev);
+                prepareExchangeMultiGPU(para.get(), lev, -1);
+		        exchangeMultiGPU(para.get(), communicator, cudaManager.get(), lev, -1);
                 //////////////////////////////////////////////////////////////////////////
                //if (para->getD3Qxx()==19)
                //{
@@ -891,9 +970,20 @@ void Simulation::run()
 				resetMedian(para.get());
 				/////////////////////////////////
 			}
+            if (para->getCalcTurbulenceIntensity()) 
+			{
+                uint t_diff = t - t_turbulenceIntensity;
+                calcTurbulenceIntensity(para.get(), cudaManager.get(), t_diff);
+                //writeAllTiDatafToFile(para.get(), t);
+            }
 			////////////////////////////////////////////////////////////////////////
 			dataWriter->writeTimestep(para, t);
 			////////////////////////////////////////////////////////////////////////
+            if (para->getCalcTurbulenceIntensity()) {
+                t_turbulenceIntensity = t;
+                resetVelocityFluctuationsAndMeans(para.get(), cudaManager.get());
+            }
+			////////////////////////////////////////////////////////////////////////
             if (para->getCalcDragLift()) printDragLift(para.get(), cudaManager.get(), t);
 			////////////////////////////////////////////////////////////////////////
 			if (para->getCalcParticle()) copyAndPrintParticles(para.get(), cudaManager.get(), t, false);
@@ -907,6 +997,8 @@ void Simulation::run()
       }
 	}
 
+	/////////////////////////////////////////////////////////////////////////
+
 	////////////////////////////////////////////////////////////////////////////////
 	//printDragLift(para);
 	////////////////////////////////////////////////////////////////////////////////
@@ -1084,8 +1176,14 @@ void Simulation::definePMarea(std::shared_ptr<PorousMedia> pMedia)
 
 void Simulation::free()
 {
+	// Cuda Streams
+    if (para->getUseStreams()) {
+        para->getStreamManager()->destroyCudaEvents();
+        para->getStreamManager()->terminateStreams();
+	}
+
 	//CudaFreeHostMemory
-	for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
+    for (int lev = para->getCoarse(); lev <= para->getFine(); lev++)
 	{
 		//para->cudaFreeFull(lev);
 		cudaManager->cudaFreeCoord(lev);
@@ -1215,6 +1313,10 @@ void Simulation::free()
 		}
 	}
 	//////////////////////////////////////////////////////////////////////////
+	// Turbulence Intensity
+	if (para->getCalcTurbulenceIntensity()) {
+        cudaFreeTurbulenceIntensityArrays(para.get(), cudaManager.get());
+    }
 	//PreCollisionInteractors
 	for( SPtr<PreCollisionInteractor> actuator: para->getActuators()){
 		actuator->free(para.get(), cudaManager.get());
@@ -1224,4 +1325,4 @@ void Simulation::free()
 		probe->free(para.get(), cudaManager.get());
 	}
 	//////////////////////////////////////////////////////////////////////////
-}
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.h b/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
index 72c86140258b01aec3b3ed00d59c271f1824d514..44da6df66ef4038da51193c11f421f90a7984200 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.h
@@ -28,13 +28,16 @@ class KernelFactory;
 class PreProcessor;
 class PreProcessorFactory;
 class TrafficMovementFactory;
+class UpdateGrid27;
 
 class VIRTUALFLUIDS_GPU_EXPORT Simulation
 {
 public:
 	Simulation(vf::gpu::Communicator& communicator);
 	void run();
-	void init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std::shared_ptr<DataWriter> dataWriter, std::shared_ptr<CudaMemoryManager> cudaManager);
+    void init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std::shared_ptr<DataWriter> dataWriter,
+              std::shared_ptr<CudaMemoryManager> cudaManager);
+    void allocNeighborsOffsetsScalesAndBoundaries(SPtr<GridProvider> &gridProvider);
 	void free();
 	void bulk();
 	void porousMedia();
@@ -104,5 +107,6 @@ protected:
 	SPtr<EnstrophyAnalyzer> enstrophyAnalyzer;
 	////////////////////////////////////////////////////////////////////////////
 
+	SPtr<UpdateGrid27> updateGrid27;
  };
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/Output/EdgeNodeDebugWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/EdgeNodeDebugWriter.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..97b0b7421a1eb457096d191fe555122f23859ae3
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/EdgeNodeDebugWriter.hpp
@@ -0,0 +1,101 @@
+#ifndef EDGENODEDEBUG_HPP
+#define EDGENODEDEBUG_HPP
+
+#include <fstream>
+#include <sstream>
+#include <stdio.h>
+// #include <math.h>
+#include "Core/StringUtilities/StringUtil.h"
+#include "LBM/D3Q27.h"
+#include "LBM/LB.h"
+#include "Parameter/Parameter.h"
+#include "basics/utilities/UbSystem.h"
+#include <basics/writer/WbWriterVtkXmlBinary.h>
+#include <cmath>
+
+#include "VirtualFluids_GPU/Communication/Communicator.h"
+
+namespace EdgeNodeDebugWriter
+{
+
+void addCoordinatesToNodeVector(SPtr<LBMSimulationParameter> parH, std::vector<UbTupleFloat3> &nodesVec, int indexInNodesVector, int sparseIndexOfNode){
+    double x1 = parH->coordX_SP[sparseIndexOfNode];
+    double x2 = parH->coordY_SP[sparseIndexOfNode];
+    double x3 = parH->coordZ_SP[sparseIndexOfNode];
+    nodesVec[indexInNodesVector] = makeUbTuple((float)x1, (float)x2, (float)x3);
+}
+
+void writeEdgeNodesXZ_Send(SPtr<Parameter> para)
+{
+    std::vector<UbTupleFloat3> nodesVec;
+    std::vector<std::string> datanames = { "SparseIndex", "ProcessNeighbor", "IndexInSendVector", "AfterFtoC" };
+    std::vector<std::vector<double>> nodedata;
+
+    int numberOfNodes = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++){
+        numberOfNodes += (int) para->getParH(level)->edgeNodesXtoZ.size();
+    }
+    nodesVec.resize(numberOfNodes);
+    nodedata.resize(datanames.size(), std::vector<double>(numberOfNodes));
+
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (int u = 0; u < (int)para->getParH(level)->edgeNodesXtoZ.size(); u++) {
+            int indexOfProcessNeighborSend = para->getParH(level)->edgeNodesXtoZ[u].indexOfProcessNeighborSend;
+            int indexInSendBuffer = para->getParH(level)->edgeNodesXtoZ[u].indexInSendBuffer;
+            int sparseIndex = para->getParH(level)->sendProcessNeighborZ[indexOfProcessNeighborSend].index[indexInSendBuffer];
+            nodedata[0][nodeCount] = sparseIndex;
+            nodedata[1][nodeCount] = indexOfProcessNeighborSend;
+            nodedata[2][nodeCount] = indexInSendBuffer;
+            nodedata[3][nodeCount] = indexInSendBuffer < para->getParH(level)->sendProcessNeighborsAfterFtoCZ[indexOfProcessNeighborSend].numberOfNodes;
+
+            addCoordinatesToNodeVector(para->getParH(level), nodesVec, nodeCount, sparseIndex);
+
+            nodeCount++;
+        }
+        std::string filenameVec = para->getFName() + "_writeEdgeNodesXZ_Send_PID_" +
+                                  std::to_string(vf::gpu::Communicator::getInstance().getPID()) + "_" +
+                                  StringUtil::toString<int>(level);
+
+        WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
+    }
+}
+
+void writeEdgeNodesXZ_Recv(SPtr<Parameter> para)
+{
+    std::vector<UbTupleFloat3> nodesVec;
+    std::vector<std::string> datanames = { "SparseIndex", "ProcessNeighbor", "IndexInRecvVector", "AfterFtoC" };
+    std::vector<std::vector<double>> nodedata;
+
+    int numberOfNodes = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++){
+        numberOfNodes += (int) para->getParH(level)->edgeNodesXtoZ.size();
+    }
+    nodesVec.resize(numberOfNodes);
+    nodedata.resize(datanames.size(), std::vector<double>(numberOfNodes));
+
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (int u = 0; u < (int)para->getParH(level)->edgeNodesXtoZ.size(); u++) {
+            int indexOfProcessNeighborRecv = para->getParH(level)->edgeNodesXtoZ[u].indexOfProcessNeighborRecv;
+            int indexInRecvBuffer = para->getParH(level)->edgeNodesXtoZ[u].indexInRecvBuffer;
+            int sparseIndex = para->getParH(level)->recvProcessNeighborX[indexOfProcessNeighborRecv].index[indexInRecvBuffer];
+            nodedata[0][nodeCount] = sparseIndex;
+            nodedata[1][nodeCount] = indexOfProcessNeighborRecv;
+            nodedata[2][nodeCount] = indexInRecvBuffer;
+            nodedata[3][nodeCount] = indexInRecvBuffer < para->getParH(level)->recvProcessNeighborX[indexOfProcessNeighborRecv].numberOfNodes;
+
+            addCoordinatesToNodeVector(para->getParH(level), nodesVec, nodeCount, sparseIndex);
+
+            nodeCount++;
+        }
+        std::string filenameVec = para->getFName() + "_writeEdgeNodesXZ_Recv_PID_" +
+                                  std::to_string(vf::gpu::Communicator::getInstance().getPID()) + "_" +
+                                  StringUtil::toString<int>(level);
+
+        WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
+    }
+}
+} // namespace EdgeNodeDebugWriter
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/Output/FileWriter.cpp b/src/gpu/VirtualFluids_GPU/Output/FileWriter.cpp
index ba8bac5939460c35c76a1ecbb378d2d69423c014..a86afac9982091eb1c12363568817c43f3d54116 100644
--- a/src/gpu/VirtualFluids_GPU/Output/FileWriter.cpp
+++ b/src/gpu/VirtualFluids_GPU/Output/FileWriter.cpp
@@ -25,10 +25,10 @@
 void FileWriter::writeInit(std::shared_ptr<Parameter> para, std::shared_ptr<CudaMemoryManager> cudaManager)
 {
     unsigned int timestep = para->getTInit();
-	for (int level = para->getCoarse(); level <= para->getFine(); level++) {
-		cudaManager->cudaCopyPrint(level);
-		writeTimestep(para, timestep, level);
-	}
+    for (int level = para->getCoarse(); level <= para->getFine(); level++) {
+        cudaManager->cudaCopyPrint(level);
+        writeTimestep(para, timestep, level);
+    }
 
     this->writeCollectionFile(para, timestep);
 
@@ -51,35 +51,35 @@ void FileWriter::writeTimestep(std::shared_ptr<Parameter> para, unsigned int tim
 {
     const unsigned int numberOfParts = para->getParH(level)->size_Mat_SP / para->getlimitOfNodesForVTK() + 1;
     std::vector<std::string> fname;
-	std::vector<std::string> fnameMed;
-	for (unsigned int i = 1; i <= numberOfParts; i++)
-	{
-		fname.push_back(para->getFName() + "_bin_lev_" + StringUtil::toString<int>(level) + "_ID_" + StringUtil::toString<int>(para->getMyID()) + "_Part_" + StringUtil::toString<int>(i) + "_t_" + StringUtil::toString<int>(timestep) + ".vtk");
-		fnameMed.push_back(para->getFName() + "_bin_median_lev_" + StringUtil::toString<int>(level) + "_ID_" + StringUtil::toString<int>(para->getMyID()) + "_Part_" + StringUtil::toString<int>(i) + "_t_" + StringUtil::toString<int>(timestep) + ".vtk");
+    std::vector<std::string> fnameMed;
+    for (unsigned int i = 1; i <= numberOfParts; i++)
+    {
+        fname.push_back(para->getFName() + "_bin_lev_" + StringUtil::toString<int>(level) + "_ID_" + StringUtil::toString<int>(para->getMyID()) + "_Part_" + StringUtil::toString<int>(i) + "_t_" + StringUtil::toString<int>(timestep) + ".vtk");
+        fnameMed.push_back(para->getFName() + "_bin_median_lev_" + StringUtil::toString<int>(level) + "_ID_" + StringUtil::toString<int>(para->getMyID()) + "_Part_" + StringUtil::toString<int>(i) + "_t_" + StringUtil::toString<int>(timestep) + ".vtk");
 
         this->fileNamesForCollectionFile.push_back( fname.back() );
         this->fileNamesForCollectionFileMedian.push_back( fnameMed.back() );
-	}
-
-	if (para->getDiffOn() == true)
-		writeUnstrucuredGridLTConc(para, level, fname);
-	else
-		writeUnstrucuredGridLT(para, level, fname);
-
-	if (para->getCalcMedian())
-	{
-		if (para->getDiffOn() == true)
-			writeUnstrucuredGridMedianLTConc(para, level, fnameMed);
-		else
-			writeUnstrucuredGridMedianLT(para, level, fnameMed);
-	}
+    }
+
+    if (para->getDiffOn() == true)
+        writeUnstrucuredGridLTConc(para, level, fname);
+    else
+        writeUnstrucuredGridLT(para, level, fname);
+
+    if (para->getCalcMedian())
+    {
+        if (para->getDiffOn() == true)
+            writeUnstrucuredGridMedianLTConc(para, level, fnameMed);
+        else
+            writeUnstrucuredGridMedianLT(para, level, fnameMed);
+    }
 }
 
 bool FileWriter::isPeriodicCell(std::shared_ptr<Parameter> para, int level, unsigned int number2, unsigned int number1, unsigned int number3, unsigned int number5)
 {
-	return (para->getParH(level)->coordX_SP[number2] < para->getParH(level)->coordX_SP[number1]) ||
-		   (para->getParH(level)->coordY_SP[number3] < para->getParH(level)->coordY_SP[number1]) ||
-		   (para->getParH(level)->coordZ_SP[number5] < para->getParH(level)->coordZ_SP[number1]);
+    return (para->getParH(level)->coordX_SP[number2] < para->getParH(level)->coordX_SP[number1]) ||
+           (para->getParH(level)->coordY_SP[number3] < para->getParH(level)->coordY_SP[number1]) ||
+           (para->getParH(level)->coordZ_SP[number5] < para->getParH(level)->coordZ_SP[number1]);
 }
 
 void VIRTUALFLUIDS_GPU_EXPORT FileWriter::writeCollectionFile(std::shared_ptr<Parameter> para, unsigned int timestep)
@@ -181,6 +181,127 @@ void FileWriter::writeUnstrucuredGridLT(std::shared_ptr<Parameter> para, int lev
     nodedatanames.push_back("vx2");
     nodedatanames.push_back("vx3");
     nodedatanames.push_back("geo");
+
+    uint firstTurbNode = (uint) nodedatanames.size();
+    if (para->getCalcTurbulenceIntensity()) {
+        nodedatanames.push_back("vxx");
+        nodedatanames.push_back("vyy");
+        nodedatanames.push_back("vzz");
+        nodedatanames.push_back("vxy");
+        nodedatanames.push_back("vxz");
+        nodedatanames.push_back("vyz");
+    }
+    unsigned int number1, number2, number3, number4, number5, number6, number7, number8;
+    uint dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8;
+    bool neighborsAreFluid;
+    unsigned int startpos = 0;
+    unsigned int endpos = 0;
+    unsigned int sizeOfNodes = 0;
+    std::vector< std::vector< double > > nodedata(nodedatanames.size());
+
+    for (unsigned int part = 0; part < fname.size(); part++)
+    {
+        if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->size_Mat_SP)
+            sizeOfNodes = para->getParH(level)->size_Mat_SP - (part * para->getlimitOfNodesForVTK());
+        else
+            sizeOfNodes = para->getlimitOfNodesForVTK();
+
+        //////////////////////////////////////////////////////////////////////////
+        startpos = part * para->getlimitOfNodesForVTK();
+        endpos = startpos + sizeOfNodes;
+        //////////////////////////////////////////////////////////////////////////
+        cells.clear();
+        nodes.resize(sizeOfNodes);
+        for (uint i = 0; i < (uint)nodedatanames.size(); i++)
+            nodedata[i].resize(sizeOfNodes);
+
+        //////////////////////////////////////////////////////////////////////////
+        for (unsigned int pos = startpos; pos < endpos; pos++)
+        {
+            if (para->getParH(level)->geoSP[pos] == GEO_FLUID)
+            {
+                //////////////////////////////////////////////////////////////////////////
+                double x1 = para->getParH(level)->coordX_SP[pos];
+                double x2 = para->getParH(level)->coordY_SP[pos];
+                double x3 = para->getParH(level)->coordZ_SP[pos];
+                //////////////////////////////////////////////////////////////////////////
+                number1 = pos;
+                dn1 = pos - startpos;
+                neighborsAreFluid = true;
+                //////////////////////////////////////////////////////////////////////////
+                nodes[dn1] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+                nodedata[0][dn1] = (double)para->getParH(level)->press_SP[pos] / (double)3.0 * (double)para->getDensityRatio() * (double)para->getVelocityRatio() * (double)para->getVelocityRatio();
+                nodedata[1][dn1] = (double)para->getParH(level)->rho_SP[pos] / (double)3.0 * (double)para->getDensityRatio() * (double)para->getVelocityRatio() * (double)para->getVelocityRatio();
+                nodedata[2][dn1] = (double)para->getParH(level)->vx_SP[pos] * (double)para->getVelocityRatio();
+                nodedata[3][dn1] = (double)para->getParH(level)->vy_SP[pos] * (double)para->getVelocityRatio();
+                nodedata[4][dn1] = (double)para->getParH(level)->vz_SP[pos] * (double)para->getVelocityRatio();
+                nodedata[5][dn1] = (double)para->getParH(level)->geoSP[pos];
+
+                if (para->getCalcTurbulenceIntensity()) {
+                    nodedata[firstTurbNode    ][dn1] = (double)para->getParH(level)->vxx[pos];
+                    nodedata[firstTurbNode + 1][dn1] = (double)para->getParH(level)->vyy[pos];
+                    nodedata[firstTurbNode + 2][dn1] = (double)para->getParH(level)->vzz[pos];
+                    nodedata[firstTurbNode + 3][dn1] = (double)para->getParH(level)->vxy[pos];
+                    nodedata[firstTurbNode + 4][dn1] = (double)para->getParH(level)->vxz[pos];
+                    nodedata[firstTurbNode + 5][dn1] = (double)para->getParH(level)->vyz[pos];
+                }
+
+                //////////////////////////////////////////////////////////////////////////
+                number2 = para->getParH(level)->neighborX_SP[number1];
+                number3 = para->getParH(level)->neighborY_SP[number2];
+                number4 = para->getParH(level)->neighborY_SP[number1];
+                number5 = para->getParH(level)->neighborZ_SP[number1];
+                number6 = para->getParH(level)->neighborZ_SP[number2];
+                number7 = para->getParH(level)->neighborZ_SP[number3];
+                number8 = para->getParH(level)->neighborZ_SP[number4];
+                //////////////////////////////////////////////////////////////////////////
+                if (para->getParH(level)->geoSP[number2] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number3] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number4] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number5] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number6] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number7] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number8] != GEO_FLUID)  neighborsAreFluid = false;
+                //////////////////////////////////////////////////////////////////////////
+                if (number2 > endpos ||
+                    number3 > endpos ||
+                    number4 > endpos ||
+                    number5 > endpos ||
+                    number6 > endpos ||
+                    number7 > endpos ||
+                    number8 > endpos)  neighborsAreFluid = false;
+                //////////////////////////////////////////////////////////////////////////
+                dn2 = number2 - startpos;
+                dn3 = number3 - startpos;
+                dn4 = number4 - startpos;
+                dn5 = number5 - startpos;
+                dn6 = number6 - startpos;
+                dn7 = number7 - startpos;
+                dn8 = number8 - startpos;
+                //////////////////////////////////////////////////////////////////////////
+                if (isPeriodicCell(para, level, number2, number1, number3, number5))
+                    continue;
+                //////////////////////////////////////////////////////////////////////////
+                if (neighborsAreFluid)
+                    cells.push_back(makeUbTuple(dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8));
+            }
+        }
+        WbWriterVtkXmlBinary::getInstance()->writeOctsWithNodeData(fname[part], nodes, cells, nodedatanames, nodedata);
+    }
+}
+
+void FileWriter::writeUnstrucuredGridLTConc(std::shared_ptr<Parameter> para, int level, std::vector<std::string >& fname)
+{
+    std::vector< UbTupleFloat3 > nodes;
+    std::vector< UbTupleUInt8 > cells;
+    std::vector< std::string > nodedatanames;
+    nodedatanames.push_back("press");
+    nodedatanames.push_back("rho");
+    nodedatanames.push_back("vx1");
+    nodedatanames.push_back("vx2");
+    nodedatanames.push_back("vx3");
+    nodedatanames.push_back("geo");
+    nodedatanames.push_back("conc");
     unsigned int number1, number2, number3, number4, number5, number6, number7, number8;
     uint dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8;
     bool neighborsAreFluid;
@@ -208,6 +329,7 @@ void FileWriter::writeUnstrucuredGridLT(std::shared_ptr<Parameter> para, int lev
         nodedata[3].resize(sizeOfNodes);
         nodedata[4].resize(sizeOfNodes);
         nodedata[5].resize(sizeOfNodes);
+        nodedata[6].resize(sizeOfNodes);
         //////////////////////////////////////////////////////////////////////////
         for (unsigned int pos = startpos; pos < endpos; pos++)
         {
@@ -229,6 +351,7 @@ void FileWriter::writeUnstrucuredGridLT(std::shared_ptr<Parameter> para, int lev
                 nodedata[3][dn1] = (double)para->getParH(level)->vy_SP[pos] * (double)para->getVelocityRatio();
                 nodedata[4][dn1] = (double)para->getParH(level)->vz_SP[pos] * (double)para->getVelocityRatio();
                 nodedata[5][dn1] = (double)para->getParH(level)->geoSP[pos];
+                nodedata[6][dn1] = (double)para->getParH(level)->Conc[pos];
                 //////////////////////////////////////////////////////////////////////////
                 number2 = para->getParH(level)->neighborX_SP[number1];
                 number3 = para->getParH(level)->neighborY_SP[number2];
@@ -261,10 +384,10 @@ void FileWriter::writeUnstrucuredGridLT(std::shared_ptr<Parameter> para, int lev
                 dn6 = number6 - startpos;
                 dn7 = number7 - startpos;
                 dn8 = number8 - startpos;
-				//////////////////////////////////////////////////////////////////////////
-				if (isPeriodicCell(para, level, number2, number1, number3, number5))
-					continue;
-				//////////////////////////////////////////////////////////////////////////
+                //////////////////////////////////////////////////////////////////////////
+                if (isPeriodicCell(para, level, number2, number1, number3, number5))
+                    continue;
+                //////////////////////////////////////////////////////////////////////////
                 if (neighborsAreFluid)
                     cells.push_back(makeUbTuple(dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8));
             }
@@ -273,329 +396,223 @@ void FileWriter::writeUnstrucuredGridLT(std::shared_ptr<Parameter> para, int lev
     }
 }
 
-void FileWriter::writeUnstrucuredGridLTConc(std::shared_ptr<Parameter> para, int level, std::vector<std::string >& fname)
-{
-	std::vector< UbTupleFloat3 > nodes;
-	std::vector< UbTupleUInt8 > cells;
-	std::vector< std::string > nodedatanames;
-	nodedatanames.push_back("press");
-	nodedatanames.push_back("rho");
-	nodedatanames.push_back("vx1");
-	nodedatanames.push_back("vx2");
-	nodedatanames.push_back("vx3");
-	nodedatanames.push_back("geo");
-	nodedatanames.push_back("conc");
-	unsigned int number1, number2, number3, number4, number5, number6, number7, number8;
-	uint dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8;
-	bool neighborsAreFluid;
-	unsigned int startpos = 0;
-	unsigned int endpos = 0;
-	unsigned int sizeOfNodes = 0;
-	std::vector< std::vector< double > > nodedata(nodedatanames.size());
-
-	for (unsigned int part = 0; part < fname.size(); part++)
-	{
-		if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->size_Mat_SP)
-			sizeOfNodes = para->getParH(level)->size_Mat_SP - (part * para->getlimitOfNodesForVTK());
-		else
-			sizeOfNodes = para->getlimitOfNodesForVTK();
-
-		//////////////////////////////////////////////////////////////////////////
-		startpos = part * para->getlimitOfNodesForVTK();
-		endpos = startpos + sizeOfNodes;
-		//////////////////////////////////////////////////////////////////////////
-		cells.clear();
-		nodes.resize(sizeOfNodes);
-		nodedata[0].resize(sizeOfNodes);
-		nodedata[1].resize(sizeOfNodes);
-		nodedata[2].resize(sizeOfNodes);
-		nodedata[3].resize(sizeOfNodes);
-		nodedata[4].resize(sizeOfNodes);
-		nodedata[5].resize(sizeOfNodes);
-		nodedata[6].resize(sizeOfNodes);
-		//////////////////////////////////////////////////////////////////////////
-		for (unsigned int pos = startpos; pos < endpos; pos++)
-		{
-			if (para->getParH(level)->geoSP[pos] == GEO_FLUID)
-			{
-				//////////////////////////////////////////////////////////////////////////
-				double x1 = para->getParH(level)->coordX_SP[pos];
-				double x2 = para->getParH(level)->coordY_SP[pos];
-				double x3 = para->getParH(level)->coordZ_SP[pos];
-				//////////////////////////////////////////////////////////////////////////
-				number1 = pos;
-				dn1 = pos - startpos;
-				neighborsAreFluid = true;
-				//////////////////////////////////////////////////////////////////////////
-				nodes[dn1] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
-				nodedata[0][dn1] = (double)para->getParH(level)->press_SP[pos] / (double)3.0 * (double)para->getDensityRatio() * (double)para->getVelocityRatio() * (double)para->getVelocityRatio();
-				nodedata[1][dn1] = (double)para->getParH(level)->rho_SP[pos] / (double)3.0 * (double)para->getDensityRatio() * (double)para->getVelocityRatio() * (double)para->getVelocityRatio();
-				nodedata[2][dn1] = (double)para->getParH(level)->vx_SP[pos] * (double)para->getVelocityRatio();
-				nodedata[3][dn1] = (double)para->getParH(level)->vy_SP[pos] * (double)para->getVelocityRatio();
-				nodedata[4][dn1] = (double)para->getParH(level)->vz_SP[pos] * (double)para->getVelocityRatio();
-				nodedata[5][dn1] = (double)para->getParH(level)->geoSP[pos];
-				nodedata[6][dn1] = (double)para->getParH(level)->Conc[pos];
-				//////////////////////////////////////////////////////////////////////////
-				number2 = para->getParH(level)->neighborX_SP[number1];
-				number3 = para->getParH(level)->neighborY_SP[number2];
-				number4 = para->getParH(level)->neighborY_SP[number1];
-				number5 = para->getParH(level)->neighborZ_SP[number1];
-				number6 = para->getParH(level)->neighborZ_SP[number2];
-				number7 = para->getParH(level)->neighborZ_SP[number3];
-				number8 = para->getParH(level)->neighborZ_SP[number4];
-				//////////////////////////////////////////////////////////////////////////
-				if (para->getParH(level)->geoSP[number2] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number3] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number4] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number5] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number6] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number7] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number8] != GEO_FLUID)  neighborsAreFluid = false;
-				//////////////////////////////////////////////////////////////////////////
-				if (number2 > endpos ||
-					number3 > endpos ||
-					number4 > endpos ||
-					number5 > endpos ||
-					number6 > endpos ||
-					number7 > endpos ||
-					number8 > endpos)  neighborsAreFluid = false;
-				//////////////////////////////////////////////////////////////////////////
-				dn2 = number2 - startpos;
-				dn3 = number3 - startpos;
-				dn4 = number4 - startpos;
-				dn5 = number5 - startpos;
-				dn6 = number6 - startpos;
-				dn7 = number7 - startpos;
-				dn8 = number8 - startpos;
-				//////////////////////////////////////////////////////////////////////////
-				if (isPeriodicCell(para, level, number2, number1, number3, number5))
-					continue;
-				//////////////////////////////////////////////////////////////////////////
-				if (neighborsAreFluid)
-					cells.push_back(makeUbTuple(dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8));
-			}
-		}
-		WbWriterVtkXmlBinary::getInstance()->writeOctsWithNodeData(fname[part], nodes, cells, nodedatanames, nodedata);
-	}
-}
-
 void FileWriter::writeUnstrucuredGridMedianLT(std::shared_ptr<Parameter> para, int level, std::vector<std::string >& fname)
 {
-	std::vector< UbTupleFloat3 > nodes;
-	std::vector< UbTupleUInt8 > cells;
-	//std::vector< UbTupleUInt8 > cells2;
-	std::vector< std::string > nodedatanames;
-	nodedatanames.push_back("pressMed");
-	nodedatanames.push_back("rhoMed");
-	nodedatanames.push_back("vx1Med");
-	nodedatanames.push_back("vx2Med");
-	nodedatanames.push_back("vx3Med");
-	nodedatanames.push_back("geo");
-	unsigned int number1, number2, number3, number4, number5, number6, number7, number8;
-	unsigned int dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8;
-	bool neighborsFluid;
-	unsigned int startpos = 0;
-	unsigned int endpos = 0;
-	unsigned int sizeOfNodes = 0;
-	std::vector< std::vector< double > > nodedata(nodedatanames.size());
-
-	//printf("\n test for if... \n");
-	for (unsigned int part = 0; part < fname.size(); part++)
-	{
-		//printf("\n test in if I... \n");
-		//////////////////////////////////////////////////////////////////////////
-		if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->size_Mat_SP)
-		{
-			sizeOfNodes = para->getParH(level)->size_Mat_SP - (part * para->getlimitOfNodesForVTK());
-		}
-		else
-		{
-			sizeOfNodes = para->getlimitOfNodesForVTK();
-		}
-		//////////////////////////////////////////////////////////////////////////
-		startpos = part * para->getlimitOfNodesForVTK();
-		endpos = startpos + sizeOfNodes;
-		//////////////////////////////////////////////////////////////////////////
-		cells.clear();
-		nodes.resize(sizeOfNodes);
-		nodedata[0].resize(sizeOfNodes);
-		nodedata[1].resize(sizeOfNodes);
-		nodedata[2].resize(sizeOfNodes);
-		nodedata[3].resize(sizeOfNodes);
-		nodedata[4].resize(sizeOfNodes);
-		nodedata[5].resize(sizeOfNodes);
-		//////////////////////////////////////////////////////////////////////////
-		//printf("\n test in if II... \n");
-		for (unsigned int pos = startpos; pos < endpos; pos++)
-		{
-			if (para->getParH(level)->geoSP[pos] == GEO_FLUID)
-			{
-				//////////////////////////////////////////////////////////////////////////
-				double x1 = para->getParH(level)->coordX_SP[pos];
-				double x2 = para->getParH(level)->coordY_SP[pos];
-				double x3 = para->getParH(level)->coordZ_SP[pos];
-				//////////////////////////////////////////////////////////////////////////
-				number1 = pos;
-				dn1 = pos - startpos;
-				neighborsFluid = true;
-				//////////////////////////////////////////////////////////////////////////
-				nodes[dn1] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
-				nodedata[0][dn1] = para->getParH(level)->press_SP_Med_Out[pos] / 3.0f * para->getDensityRatio() * para->getVelocityRatio() * para->getVelocityRatio();
-				nodedata[1][dn1] = para->getParH(level)->rho_SP_Med_Out[pos] / 3.0f * para->getDensityRatio() * para->getVelocityRatio() * para->getVelocityRatio();
-				nodedata[2][dn1] = para->getParH(level)->vx_SP_Med_Out[pos] * para->getVelocityRatio();
-				nodedata[3][dn1] = para->getParH(level)->vy_SP_Med_Out[pos] * para->getVelocityRatio();
-				nodedata[4][dn1] = para->getParH(level)->vz_SP_Med_Out[pos] * para->getVelocityRatio();
-				nodedata[5][dn1] = (double)para->getParH(level)->geoSP[pos];
-				//////////////////////////////////////////////////////////////////////////
-				number2 = para->getParH(level)->neighborX_SP[number1];
-				number3 = para->getParH(level)->neighborY_SP[number2];
-				number4 = para->getParH(level)->neighborY_SP[number1];
-				number5 = para->getParH(level)->neighborZ_SP[number1];
-				number6 = para->getParH(level)->neighborZ_SP[number2];
-				number7 = para->getParH(level)->neighborZ_SP[number3];
-				number8 = para->getParH(level)->neighborZ_SP[number4];
-				//////////////////////////////////////////////////////////////////////////
-				if (para->getParH(level)->geoSP[number2] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number3] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number4] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number5] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number6] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number7] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number8] != GEO_FLUID)  neighborsFluid = false;
-				//////////////////////////////////////////////////////////////////////////
-				if (number2 > endpos ||
-					number3 > endpos ||
-					number4 > endpos ||
-					number5 > endpos ||
-					number6 > endpos ||
-					number7 > endpos ||
-					number8 > endpos)  neighborsFluid = false;
-				//////////////////////////////////////////////////////////////////////////
-				dn2 = number2 - startpos;
-				dn3 = number3 - startpos;
-				dn4 = number4 - startpos;
-				dn5 = number5 - startpos;
-				dn6 = number6 - startpos;
-				dn7 = number7 - startpos;
-				dn8 = number8 - startpos;
-				//////////////////////////////////////////////////////////////////////////
-				if (isPeriodicCell(para, level, number2, number1, number3, number5))
-					continue;
-				//////////////////////////////////////////////////////////////////////////
-				if (neighborsFluid == true) cells.push_back(makeUbTuple(dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8));
-				//////////////////////////////////////////////////////////////////////////
-			}
-		}
-		WbWriterVtkXmlBinary::getInstance()->writeOctsWithNodeData(fname[part], nodes, cells, nodedatanames, nodedata);
-		//////////////////////////////////////////////////////////////////////////
-	}
+    std::vector< UbTupleFloat3 > nodes;
+    std::vector< UbTupleUInt8 > cells;
+    //std::vector< UbTupleUInt8 > cells2;
+    std::vector< std::string > nodedatanames;
+    nodedatanames.push_back("pressMed");
+    nodedatanames.push_back("rhoMed");
+    nodedatanames.push_back("vx1Med");
+    nodedatanames.push_back("vx2Med");
+    nodedatanames.push_back("vx3Med");
+    nodedatanames.push_back("geo");
+    unsigned int number1, number2, number3, number4, number5, number6, number7, number8;
+    unsigned int dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8;
+    bool neighborsFluid;
+    unsigned int startpos = 0;
+    unsigned int endpos = 0;
+    unsigned int sizeOfNodes = 0;
+    std::vector< std::vector< double > > nodedata(nodedatanames.size());
+
+    //printf("\n test for if... \n");
+    for (unsigned int part = 0; part < fname.size(); part++)
+    {
+        //printf("\n test in if I... \n");
+        //////////////////////////////////////////////////////////////////////////
+        if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->size_Mat_SP)
+        {
+            sizeOfNodes = para->getParH(level)->size_Mat_SP - (part * para->getlimitOfNodesForVTK());
+        }
+        else
+        {
+            sizeOfNodes = para->getlimitOfNodesForVTK();
+        }
+        //////////////////////////////////////////////////////////////////////////
+        startpos = part * para->getlimitOfNodesForVTK();
+        endpos = startpos + sizeOfNodes;
+        //////////////////////////////////////////////////////////////////////////
+        cells.clear();
+        nodes.resize(sizeOfNodes);
+        nodedata[0].resize(sizeOfNodes);
+        nodedata[1].resize(sizeOfNodes);
+        nodedata[2].resize(sizeOfNodes);
+        nodedata[3].resize(sizeOfNodes);
+        nodedata[4].resize(sizeOfNodes);
+        nodedata[5].resize(sizeOfNodes);
+        //////////////////////////////////////////////////////////////////////////
+        //printf("\n test in if II... \n");
+        for (unsigned int pos = startpos; pos < endpos; pos++)
+        {
+            if (para->getParH(level)->geoSP[pos] == GEO_FLUID)
+            {
+                //////////////////////////////////////////////////////////////////////////
+                double x1 = para->getParH(level)->coordX_SP[pos];
+                double x2 = para->getParH(level)->coordY_SP[pos];
+                double x3 = para->getParH(level)->coordZ_SP[pos];
+                //////////////////////////////////////////////////////////////////////////
+                number1 = pos;
+                dn1 = pos - startpos;
+                neighborsFluid = true;
+                //////////////////////////////////////////////////////////////////////////
+                nodes[dn1] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+                nodedata[0][dn1] = para->getParH(level)->press_SP_Med_Out[pos] / 3.0f * para->getDensityRatio() * para->getVelocityRatio() * para->getVelocityRatio();
+                nodedata[1][dn1] = para->getParH(level)->rho_SP_Med_Out[pos] / 3.0f * para->getDensityRatio() * para->getVelocityRatio() * para->getVelocityRatio();
+                nodedata[2][dn1] = para->getParH(level)->vx_SP_Med_Out[pos] * para->getVelocityRatio();
+                nodedata[3][dn1] = para->getParH(level)->vy_SP_Med_Out[pos] * para->getVelocityRatio();
+                nodedata[4][dn1] = para->getParH(level)->vz_SP_Med_Out[pos] * para->getVelocityRatio();
+                nodedata[5][dn1] = (double)para->getParH(level)->geoSP[pos];
+                //////////////////////////////////////////////////////////////////////////
+                number2 = para->getParH(level)->neighborX_SP[number1];
+                number3 = para->getParH(level)->neighborY_SP[number2];
+                number4 = para->getParH(level)->neighborY_SP[number1];
+                number5 = para->getParH(level)->neighborZ_SP[number1];
+                number6 = para->getParH(level)->neighborZ_SP[number2];
+                number7 = para->getParH(level)->neighborZ_SP[number3];
+                number8 = para->getParH(level)->neighborZ_SP[number4];
+                //////////////////////////////////////////////////////////////////////////
+                if (para->getParH(level)->geoSP[number2] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number3] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number4] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number5] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number6] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number7] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number8] != GEO_FLUID)  neighborsFluid = false;
+                //////////////////////////////////////////////////////////////////////////
+                if (number2 > endpos ||
+                    number3 > endpos ||
+                    number4 > endpos ||
+                    number5 > endpos ||
+                    number6 > endpos ||
+                    number7 > endpos ||
+                    number8 > endpos)  neighborsFluid = false;
+                //////////////////////////////////////////////////////////////////////////
+                dn2 = number2 - startpos;
+                dn3 = number3 - startpos;
+                dn4 = number4 - startpos;
+                dn5 = number5 - startpos;
+                dn6 = number6 - startpos;
+                dn7 = number7 - startpos;
+                dn8 = number8 - startpos;
+                //////////////////////////////////////////////////////////////////////////
+                if (isPeriodicCell(para, level, number2, number1, number3, number5))
+                    continue;
+                //////////////////////////////////////////////////////////////////////////
+                if (neighborsFluid)
+                    cells.push_back(makeUbTuple(dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8));
+                //////////////////////////////////////////////////////////////////////////
+            }
+        }
+        WbWriterVtkXmlBinary::getInstance()->writeOctsWithNodeData(fname[part], nodes, cells, nodedatanames, nodedata);
+        //////////////////////////////////////////////////////////////////////////
+    }
 }
 
 void FileWriter::writeUnstrucuredGridMedianLTConc(std::shared_ptr<Parameter> para, int level, std::vector<std::string >& fname)
 {
-	std::vector< UbTupleFloat3 > nodes;
-	std::vector< UbTupleUInt8 > cells;
-	std::vector< std::string > nodedatanames;
-	nodedatanames.push_back("concMed");
-	nodedatanames.push_back("pressMed");
-	nodedatanames.push_back("rhoMed");
-	nodedatanames.push_back("vx1Med");
-	nodedatanames.push_back("vx2Med");
-	nodedatanames.push_back("vx3Med");
-	nodedatanames.push_back("geo");
-	uint number1, number2, number3, number4, number5, number6, number7, number8;
-	uint dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8;
-	bool neighborsFluid;
-	uint startpos = 0;
-	uint endpos = 0;
-	uint sizeOfNodes = 0;
-	std::vector< std::vector< double > > nodedata(nodedatanames.size());
-
-	for (unsigned int part = 0; part < fname.size(); part++)
-	{
-		if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->size_Mat_SP)
-			sizeOfNodes = para->getParH(level)->size_Mat_SP - (part * para->getlimitOfNodesForVTK());
-		else
-			sizeOfNodes = para->getlimitOfNodesForVTK();
-		//////////////////////////////////////////////////////////////////////////
-		startpos = part * para->getlimitOfNodesForVTK();
-		endpos = startpos + sizeOfNodes;
-		//////////////////////////////////////////////////////////////////////////
-		cells.clear();
-		nodes.resize(sizeOfNodes);
-		nodedata[0].resize(sizeOfNodes);
-		nodedata[1].resize(sizeOfNodes);
-		nodedata[2].resize(sizeOfNodes);
-		nodedata[3].resize(sizeOfNodes);
-		nodedata[4].resize(sizeOfNodes);
-		nodedata[5].resize(sizeOfNodes);
-		nodedata[6].resize(sizeOfNodes);
-		//////////////////////////////////////////////////////////////////////////
-		for (unsigned int pos = startpos; pos < endpos; pos++)
-		{
-			if (para->getParH(level)->geoSP[pos] == GEO_FLUID)
-			{
-				//////////////////////////////////////////////////////////////////////////
-				double x1 = para->getParH(level)->coordX_SP[pos];
-				double x2 = para->getParH(level)->coordY_SP[pos];
-				double x3 = para->getParH(level)->coordZ_SP[pos];
-				//////////////////////////////////////////////////////////////////////////
-				number1 = pos;
-				dn1 = pos - startpos;
-				neighborsFluid = true;
-				//////////////////////////////////////////////////////////////////////////
-				nodes[dn1] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
-				nodedata[0][dn1] = (double)para->getParH(level)->Conc_Med_Out[pos];
-				nodedata[1][dn1] = (double)para->getParH(level)->press_SP_Med_Out[pos] / 3.0f * para->getDensityRatio() * para->getVelocityRatio() * para->getVelocityRatio();
-				nodedata[2][dn1] = (double)para->getParH(level)->rho_SP_Med_Out[pos] / 3.0f * para->getDensityRatio() * para->getVelocityRatio() * para->getVelocityRatio();
-				nodedata[3][dn1] = (double)para->getParH(level)->vx_SP_Med_Out[pos] * para->getVelocityRatio();
-				nodedata[4][dn1] = (double)para->getParH(level)->vy_SP_Med_Out[pos] * para->getVelocityRatio();
-				nodedata[5][dn1] = (double)para->getParH(level)->vz_SP_Med_Out[pos] * para->getVelocityRatio();
-				nodedata[6][dn1] = (double)para->getParH(level)->geoSP[pos];
-				//////////////////////////////////////////////////////////////////////////
-				number2 = para->getParH(level)->neighborX_SP[number1];
-				number3 = para->getParH(level)->neighborY_SP[number2];
-				number4 = para->getParH(level)->neighborY_SP[number1];
-				number5 = para->getParH(level)->neighborZ_SP[number1];
-				number6 = para->getParH(level)->neighborZ_SP[number2];
-				number7 = para->getParH(level)->neighborZ_SP[number3];
-				number8 = para->getParH(level)->neighborZ_SP[number4];
-				//////////////////////////////////////////////////////////////////////////
-				if (para->getParH(level)->geoSP[number2] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number3] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number4] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number5] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number6] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number7] != GEO_FLUID ||
-					para->getParH(level)->geoSP[number8] != GEO_FLUID)  neighborsFluid = false;
-				//////////////////////////////////////////////////////////////////////////
-				if (number2 > endpos ||
-					number3 > endpos ||
-					number4 > endpos ||
-					number5 > endpos ||
-					number6 > endpos ||
-					number7 > endpos ||
-					number8 > endpos)  neighborsFluid = false;
-				//////////////////////////////////////////////////////////////////////////
-				dn2 = number2 - startpos;
-				dn3 = number3 - startpos;
-				dn4 = number4 - startpos;
-				dn5 = number5 - startpos;
-				dn6 = number6 - startpos;
-				dn7 = number7 - startpos;
-				dn8 = number8 - startpos;
-				//////////////////////////////////////////////////////////////////////////
-				if (isPeriodicCell(para, level, number2, number1, number3, number5))
-					continue;
-				//////////////////////////////////////////////////////////////////////////
-				if (neighborsFluid) 
-					cells.push_back(makeUbTuple(dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8));
-				//////////////////////////////////////////////////////////////////////////
-			}
-		}
-		WbWriterVtkXmlBinary::getInstance()->writeOctsWithNodeData(fname[part], nodes, cells, nodedatanames, nodedata);
-		//////////////////////////////////////////////////////////////////////////
-	}
+    std::vector< UbTupleFloat3 > nodes;
+    std::vector< UbTupleUInt8 > cells;
+    std::vector< std::string > nodedatanames;
+    nodedatanames.push_back("concMed");
+    nodedatanames.push_back("pressMed");
+    nodedatanames.push_back("rhoMed");
+    nodedatanames.push_back("vx1Med");
+    nodedatanames.push_back("vx2Med");
+    nodedatanames.push_back("vx3Med");
+    nodedatanames.push_back("geo");
+    uint number1, number2, number3, number4, number5, number6, number7, number8;
+    uint dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8;
+    bool neighborsFluid;
+    uint startpos = 0;
+    uint endpos = 0;
+    uint sizeOfNodes = 0;
+    std::vector< std::vector< double > > nodedata(nodedatanames.size());
+
+    for (unsigned int part = 0; part < fname.size(); part++)
+    {
+        if (((part + 1)*para->getlimitOfNodesForVTK()) > para->getParH(level)->size_Mat_SP)
+            sizeOfNodes = para->getParH(level)->size_Mat_SP - (part * para->getlimitOfNodesForVTK());
+        else
+            sizeOfNodes = para->getlimitOfNodesForVTK();
+        //////////////////////////////////////////////////////////////////////////
+        startpos = part * para->getlimitOfNodesForVTK();
+        endpos = startpos + sizeOfNodes;
+        //////////////////////////////////////////////////////////////////////////
+        cells.clear();
+        nodes.resize(sizeOfNodes);
+        nodedata[0].resize(sizeOfNodes);
+        nodedata[1].resize(sizeOfNodes);
+        nodedata[2].resize(sizeOfNodes);
+        nodedata[3].resize(sizeOfNodes);
+        nodedata[4].resize(sizeOfNodes);
+        nodedata[5].resize(sizeOfNodes);
+        nodedata[6].resize(sizeOfNodes);
+        //////////////////////////////////////////////////////////////////////////
+        for (unsigned int pos = startpos; pos < endpos; pos++)
+        {
+            if (para->getParH(level)->geoSP[pos] == GEO_FLUID)
+            {
+                //////////////////////////////////////////////////////////////////////////
+                double x1 = para->getParH(level)->coordX_SP[pos];
+                double x2 = para->getParH(level)->coordY_SP[pos];
+                double x3 = para->getParH(level)->coordZ_SP[pos];
+                //////////////////////////////////////////////////////////////////////////
+                number1 = pos;
+                dn1 = pos - startpos;
+                neighborsFluid = true;
+                //////////////////////////////////////////////////////////////////////////
+                nodes[dn1] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+                nodedata[0][dn1] = (double)para->getParH(level)->Conc_Med_Out[pos];
+                nodedata[1][dn1] = (double)para->getParH(level)->press_SP_Med_Out[pos] / 3.0f * para->getDensityRatio() * para->getVelocityRatio() * para->getVelocityRatio();
+                nodedata[2][dn1] = (double)para->getParH(level)->rho_SP_Med_Out[pos] / 3.0f * para->getDensityRatio() * para->getVelocityRatio() * para->getVelocityRatio();
+                nodedata[3][dn1] = (double)para->getParH(level)->vx_SP_Med_Out[pos] * para->getVelocityRatio();
+                nodedata[4][dn1] = (double)para->getParH(level)->vy_SP_Med_Out[pos] * para->getVelocityRatio();
+                nodedata[5][dn1] = (double)para->getParH(level)->vz_SP_Med_Out[pos] * para->getVelocityRatio();
+                nodedata[6][dn1] = (double)para->getParH(level)->geoSP[pos];
+                //////////////////////////////////////////////////////////////////////////
+                number2 = para->getParH(level)->neighborX_SP[number1];
+                number3 = para->getParH(level)->neighborY_SP[number2];
+                number4 = para->getParH(level)->neighborY_SP[number1];
+                number5 = para->getParH(level)->neighborZ_SP[number1];
+                number6 = para->getParH(level)->neighborZ_SP[number2];
+                number7 = para->getParH(level)->neighborZ_SP[number3];
+                number8 = para->getParH(level)->neighborZ_SP[number4];
+                //////////////////////////////////////////////////////////////////////////
+                if (para->getParH(level)->geoSP[number2] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number3] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number4] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number5] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number6] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number7] != GEO_FLUID ||
+                    para->getParH(level)->geoSP[number8] != GEO_FLUID)  neighborsFluid = false;
+                //////////////////////////////////////////////////////////////////////////
+                if (number2 > endpos ||
+                    number3 > endpos ||
+                    number4 > endpos ||
+                    number5 > endpos ||
+                    number6 > endpos ||
+                    number7 > endpos ||
+                    number8 > endpos)  neighborsFluid = false;
+                //////////////////////////////////////////////////////////////////////////
+                dn2 = number2 - startpos;
+                dn3 = number3 - startpos;
+                dn4 = number4 - startpos;
+                dn5 = number5 - startpos;
+                dn6 = number6 - startpos;
+                dn7 = number7 - startpos;
+                dn8 = number8 - startpos;
+                //////////////////////////////////////////////////////////////////////////
+                if (isPeriodicCell(para, level, number2, number1, number3, number5))
+                    continue;
+                //////////////////////////////////////////////////////////////////////////
+                if (neighborsFluid)
+                    cells.push_back(makeUbTuple(dn1, dn2, dn3, dn4, dn5, dn6, dn7, dn8));
+                //////////////////////////////////////////////////////////////////////////
+            }
+        }
+        WbWriterVtkXmlBinary::getInstance()->writeOctsWithNodeData(fname[part], nodes, cells, nodedatanames, nodedata);
+        //////////////////////////////////////////////////////////////////////////
+    }
 }
 //////////////////////////////////////////////////////////////////////////
 
@@ -605,5 +622,3 @@ void FileWriter::writeUnstrucuredGridMedianLTConc(std::shared_ptr<Parameter> par
 
 
 
-
-
diff --git a/src/gpu/VirtualFluids_GPU/Output/FileWriter.h b/src/gpu/VirtualFluids_GPU/Output/FileWriter.h
index 7f5c3c2d27f852b72966d3a837952c70a3fcf54e..f0983b8987d85a21668d801e4b8c7260e118adf1 100644
--- a/src/gpu/VirtualFluids_GPU/Output/FileWriter.h
+++ b/src/gpu/VirtualFluids_GPU/Output/FileWriter.h
@@ -11,6 +11,7 @@
 
 class Parameter;
 class CudaMemoryManager;
+struct PN27;
 
 class FileWriter : public DataWriter
 {
@@ -23,7 +24,8 @@ public:
 private:
 	void VIRTUALFLUIDS_GPU_EXPORT writeTimestep(std::shared_ptr<Parameter> para, unsigned int timestep, int level) override;
 	//void VIRTUALFLUIDS_GPU_EXPORT writeParticle(Parameter* para, unsigned int t);
-	void VIRTUALFLUIDS_GPU_EXPORT writeUnstrucuredGridLT(std::shared_ptr<Parameter> para, int level, std::vector<std::string >& fname);
+    void VIRTUALFLUIDS_GPU_EXPORT writeUnstrucuredGridLT(std::shared_ptr<Parameter> para, int level,
+                                                         std::vector<std::string> &fname);
 	void VIRTUALFLUIDS_GPU_EXPORT writeUnstrucuredGridLTConc(std::shared_ptr<Parameter> para, int level, std::vector<std::string >& fname);
 	void VIRTUALFLUIDS_GPU_EXPORT writeUnstrucuredGridMedianLT(std::shared_ptr<Parameter> para, int level, std::vector<std::string >& fname);
 	void VIRTUALFLUIDS_GPU_EXPORT writeUnstrucuredGridMedianLTConc(std::shared_ptr<Parameter> para, int level, std::vector<std::string >& fname);
diff --git a/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp b/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
index 8d1337be8088f3daa55f03fc5fcf1e405c8d0b3d..dd07e4db5e970de5dda9fc5a7dbf395f1321f04f 100644
--- a/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
+++ b/src/gpu/VirtualFluids_GPU/Output/InterfaceDebugWriter.hpp
@@ -1,604 +1,911 @@
 #ifndef INTERFACEDEBUG_HPP
 #define INTERFACEDEBUG_HPP
 
-#include <stdio.h>
 #include <fstream>
 #include <sstream>
+#include <stdio.h>
 // #include <math.h>
-#include <cmath>
-#include "LBM/LB.h"
+#include "Core/StringUtilities/StringUtil.h"
 #include "LBM/D3Q27.h"
+#include "LBM/LB.h"
 #include "Parameter/Parameter.h"
 #include "basics/utilities/UbSystem.h"
-#include "Core/StringUtilities/StringUtil.h"
 #include <basics/writer/WbWriterVtkXmlBinary.h>
+#include <cmath>
 
-
-//using namespace std;
+#include "VirtualFluids_GPU/Communication/Communicator.h"
 
 namespace InterfaceDebugWriter
 {
 
-    void writeGridInterfaceLines(Parameter* para, int level, const uint* coarse, const uint* fine, uint numberOfNodes, const std::string& name)
+void writeGridInterfaceLines(Parameter *para, int level, const uint *coarse, const uint *fine, uint numberOfNodes,
+                             const std::string &name)
+{
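+    // One VTK line segment per interface cell: from the coarse node to its corresponding fine node.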
+    std::vector<UbTupleFloat3> nodes(numberOfNodes * 2);
+    std::vector<UbTupleInt2> cells(numberOfNodes);
+
+    int actualNodeNumber = 0;
+    for (uint u = 0; u < numberOfNodes; u++) {
+        const int posCoarse   = coarse[u];
+        const double x1Coarse = para->getParH(level)->coordX_SP[posCoarse];
+        const double x2Coarse = para->getParH(level)->coordY_SP[posCoarse];
+        const double x3Coarse = para->getParH(level)->coordZ_SP[posCoarse];
+
+        const int posFine   = fine[u];
+        const double x1Fine = para->getParH(level + 1)->coordX_SP[posFine];
+        const double x2Fine = para->getParH(level + 1)->coordY_SP[posFine];
+        const double x3Fine = para->getParH(level + 1)->coordZ_SP[posFine];
+
+        nodes[actualNodeNumber++] = makeUbTuple(float(x1Coarse), float(x2Coarse), float(x3Coarse));
+        nodes[actualNodeNumber++] = makeUbTuple(float(x1Fine), float(x2Fine), float(x3Fine));
+
+        cells[u] = makeUbTuple(actualNodeNumber - 2, actualNodeNumber - 1);
+    }
+    WbWriterVtkXmlBinary::getInstance()->writeLines(name, nodes, cells);
+}
+
+void writeInterfaceLinesDebugCF(Parameter *para)
+{
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        const std::string fileName = para->getFName() + "_" + StringUtil::toString<int>(level) + "_OffDebugCF.vtk";
+        writeGridInterfaceLines(para, level, para->getParH(level)->intCF.ICellCFC, para->getParH(level)->intCF.ICellCFF,
+                                para->getParH(level)->K_CF, fileName);
+    }
+}
+
+void writeInterfaceLinesDebugFC(Parameter *para)
+{
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        const std::string fileName = para->getFName() + "_" + StringUtil::toString<int>(level) + "_OffDebugFC.vtk";
+        writeGridInterfaceLines(para, level, para->getParH(level)->intFC.ICellFCC, para->getParH(level)->intFC.ICellFCF,
+                                para->getParH(level)->K_FC, fileName);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+void writeGridInterfaceLinesNeighbors(Parameter *para, int level, const uint *interfaceIndices, uint numberOfNodes,
+                                      const std::string &name)
+{
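+    // One VTK line segment per interface node: from the node to the point built from its x-, y- and z-neighbor coordinates.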
+    std::vector<UbTupleFloat3> nodes(numberOfNodes * 2);
+    std::vector<UbTupleInt2> cells(numberOfNodes);
+
+    int actualNodeNumber = 0;
+    for (uint u = 0; u < numberOfNodes; u++) {
+        const int pos   = interfaceIndices[u];
+        const double x1 = para->getParH(level)->coordX_SP[pos];
+        const double x2 = para->getParH(level)->coordY_SP[pos];
+        const double x3 = para->getParH(level)->coordZ_SP[pos];
+
+        const double x1Neighbor = para->getParH(level)->coordX_SP[para->getParH(level)->neighborX_SP[pos]];
+        const double x2Neighbor = para->getParH(level)->coordY_SP[para->getParH(level)->neighborY_SP[pos]];
+        const double x3Neighbor = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborZ_SP[pos]];
+
+        nodes[actualNodeNumber++] = (makeUbTuple(float(x1), float(x2), float(x3)));
+        nodes[actualNodeNumber++] = (makeUbTuple(float(x1Neighbor), float(x2Neighbor), float(x3Neighbor)));
+
+        cells[u] = makeUbTuple(actualNodeNumber - 2, actualNodeNumber - 1);
+    }
+    WbWriterVtkXmlBinary::getInstance()->writeLines(name, nodes, cells);
+}
+
+void writeInterfaceLinesDebugCFCneighbor(Parameter *para)
+{
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        std::string filename = para->getFName() + "_" + StringUtil::toString<int>(level) + "_CFCneighbor.vtk";
+        writeGridInterfaceLinesNeighbors(para, level, para->getParH(level)->intCF.ICellCFC, para->getParH(level)->K_CF,
+                                         filename);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+void writeInterfaceLinesDebugCFFneighbor(Parameter *para)
+{
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        std::string filename = para->getFName() + "_" + StringUtil::toString<int>(level) + "_CFFneighbor.vtk";
+        writeGridInterfaceLinesNeighbors(para, level + 1, para->getParH(level)->intCF.ICellCFF,
+                                         para->getParH(level)->K_CF, filename);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+void writeInterfaceLinesDebugFCCneighbor(Parameter *para)
+{
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        std::string filename = para->getFName() + "_" + StringUtil::toString<int>(level) + "_FCCneighbor.vtk";
+        writeGridInterfaceLinesNeighbors(para, level, para->getParH(level)->intFC.ICellFCC, para->getParH(level)->K_FC,
+                                         filename);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+void writeInterfaceLinesDebugFCFneighbor(Parameter *para)
+{
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        std::string filename = para->getFName() + "_" + StringUtil::toString<int>(level) + "_FCFneighbor.vtk";
+        writeGridInterfaceLinesNeighbors(para, level + 1, para->getParH(level)->intFC.ICellFCF,
+                                         para->getParH(level)->K_FC, filename);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+void writeInterfaceLinesDebugOff(Parameter *para)
+{
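+    // One VTK line segment per CF cell: from the offset-shifted position (fine node + offCF) to the fine node itself.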
+    std::vector<UbTupleFloat3> nodesVec;
+    std::vector<UbTupleInt2> cellsVec;
+    int nodeNumberVec = 0;
+
+    for (int level = 0; level < para->getMaxLevel(); level++) // possibly maxLevel + 1
     {
-        std::vector<UbTupleFloat3> nodes(numberOfNodes * 2);
-        std::vector<UbTupleInt2> cells(numberOfNodes);
-
-        int actualNodeNumber = 0;
-        for (uint u = 0; u < numberOfNodes; u++)
-        {
-            const int posCoarse = coarse[u];
-            const double x1Coarse = para->getParH(level)->coordX_SP[posCoarse];
-            const double x2Coarse = para->getParH(level)->coordY_SP[posCoarse];
-            const double x3Coarse = para->getParH(level)->coordZ_SP[posCoarse];
-
-            const int posFine = fine[u];
-            const double x1Fine = para->getParH(level + 1)->coordX_SP[posFine];
-            const double x2Fine = para->getParH(level + 1)->coordY_SP[posFine];
-            const double x3Fine = para->getParH(level + 1)->coordZ_SP[posFine];
-
-            nodes[actualNodeNumber++] = makeUbTuple(float(x1Coarse), float(x2Coarse), float(x3Coarse));
-            nodes[actualNodeNumber++] = makeUbTuple(float(x1Fine), float(x2Fine), float(x3Fine));
-
-            cells[u] = makeUbTuple(actualNodeNumber - 2, actualNodeNumber - 1);
+        nodeNumberVec += (int)para->getParH(level)->K_CF;
+    }
+    nodesVec.resize(nodeNumberVec * 8);
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->K_CF; u++) {
+            double xoff = para->getParH(level)->offCF.xOffCF[u];
+            double yoff = para->getParH(level)->offCF.yOffCF[u];
+            double zoff = para->getParH(level)->offCF.zOffCF[u];
+
+            int posFine = para->getParH(level)->intCF.ICellCFF[u];
+
+            double x1Fine = para->getParH(level + 1)->coordX_SP[posFine];
+            double x2Fine = para->getParH(level + 1)->coordY_SP[posFine];
+            double x3Fine = para->getParH(level + 1)->coordZ_SP[posFine];
+
+            double x1 = x1Fine + xoff;
+            double x2 = x2Fine + yoff;
+            double x3 = x3Fine + zoff;
+
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1Fine), (float)(x2Fine), (float)(x3Fine)));
+
+            cellsVec.push_back(makeUbTuple(nodeCount - 2, nodeCount - 1));
+        }
+        std::string filenameVec = para->getFName() + "_" + StringUtil::toString<int>(level) + "_OffDebugCF_Offs.vtk";
+        WbWriterVtkXmlBinary::getInstance()->writeLines(filenameVec, nodesVec, cellsVec);
+        cellsVec.clear();
+        nodesVec.clear();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+void writeInterfacePointsDebugCFC(Parameter *para)
+{
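+    // Writes the coarse nodes of the coarse-to-fine (CF) interface as VTK point clouds.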
+    std::vector<UbTupleFloat3> nodesVec2;
+    int nodeNumberVec = 0;
+
+    for (int level = 0; level < para->getMaxLevel(); level++) // possibly maxLevel + 1
+    {
+        nodeNumberVec += (int)para->getParH(level)->K_CF;
+    }
+    nodesVec2.resize(nodeNumberVec * 8);
+    int nodeCount2 = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->K_CF; u++) {
+            int pos = para->getParH(level)->intCF.ICellCFC[u];
+
+            double x1 = para->getParH(level)->coordX_SP[pos];
+            double x2 = para->getParH(level)->coordY_SP[pos];
+            double x3 = para->getParH(level)->coordZ_SP[pos];
+
+            nodesVec2[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
         }
-        WbWriterVtkXmlBinary::getInstance()->writeLines(name, nodes, cells);
+        std::string filenameVec2 = para->getFName() + "_" + StringUtil::toString<int>(level) + "_OffDebugPointsCF.vtk";
+        WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec2, nodesVec2);
     }
+}
 
+//////////////////////////////////////////////////////////////////////////
 
-    void writeInterfaceLinesDebugCF(Parameter* para)
+void writeBcPointsDebug(Parameter *para)
+{
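+    // Writes the wall boundary condition nodes (QWall.k) as VTK point clouds.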
+    std::vector<UbTupleFloat3> nodesVec2;
+    int nodeNumberVec = 0;
+
+    for (int level = 0; level <= para->getMaxLevel(); level++) // possibly maxLevel + 1
     {
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-            const std::string fileName = para->getFName() + "_" + StringUtil::toString<int>(level) + "_OffDebugCF.vtk";
-            writeGridInterfaceLines(para, level, para->getParH(level)->intCF.ICellCFC, para->getParH(level)->intCF.ICellCFF, para->getParH(level)->K_CF, fileName);
-		}
-	}
+        nodeNumberVec += (int)para->getParH(level)->QWall.kQ;
+    }
+    nodesVec2.resize(nodeNumberVec * 8);
+    int nodeCount2 = 0;
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        for (int u = 0; u < para->getParH(level)->QWall.kQ; u++) {
+            int pos = para->getParH(level)->QWall.k[u];
+
+            double x1 = para->getParH(level)->coordX_SP[pos];
+            double x2 = para->getParH(level)->coordY_SP[pos];
+            double x3 = para->getParH(level)->coordZ_SP[pos];
 
+            nodesVec2[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+        }
+        std::string filenameVec2 = para->getFName() + "_PointsBc_" + StringUtil::toString<int>(level);
+        WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec2, nodesVec2);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+void writePressPointsDebug(Parameter *para)
+{
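+    // Writes the pressure boundary condition nodes (QPress.k) as VTK point clouds.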
+    std::vector<UbTupleFloat3> nodesVec;
+    int nodeNumberVec = 0;
 
-	void writeInterfaceLinesDebugFC(Parameter* para)
+    for (int level = 0; level <= para->getMaxLevel(); level++) // possibly maxLevel + 1
     {
-        for (int level = 0; level < para->getMaxLevel(); level++)
-        {
-            const std::string fileName = para->getFName() + "_" + StringUtil::toString<int>(level) + "_OffDebugFC.vtk";
-            writeGridInterfaceLines(para, level, para->getParH(level)->intFC.ICellFCC, para->getParH(level)->intFC.ICellFCF, para->getParH(level)->K_FC, fileName);
+        nodeNumberVec += (int)para->getParH(level)->QPress.kQ;
+    }
+    nodesVec.resize(nodeNumberVec);
+    int nodeCount2 = 0;
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        for (int u = 0; u < para->getParH(level)->QPress.kQ; u++) {
+            int pos = para->getParH(level)->QPress.k[u];
+
+            double x1 = para->getParH(level)->coordX_SP[pos];
+            double x2 = para->getParH(level)->coordY_SP[pos];
+            double x3 = para->getParH(level)->coordZ_SP[pos];
+
+            nodesVec[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+        }
+        std::string filenameVec = para->getFName() + "_PointsPress_" + StringUtil::toString<int>(level);
+        WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec, nodesVec);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+void writePressNeighborPointsDebug(Parameter *para)
+{
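+    // Writes the neighbor nodes of the pressure boundary condition (QPress.kN) as VTK point clouds.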
+    std::vector<UbTupleFloat3> nodesVec;
+    int nodeNumberVec = 0;
+
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        nodeNumberVec += (int)para->getParH(level)->QPress.kQ;
+    }
+    nodesVec.resize(nodeNumberVec);
+    int nodeCount2 = 0;
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        for (int u = 0; u < para->getParH(level)->QPress.kQ; u++) {
+            int pos = para->getParH(level)->QPress.kN[u];
+
+            real x1 = para->getParH(level)->coordX_SP[pos];
+            real x2 = para->getParH(level)->coordY_SP[pos];
+            real x3 = para->getParH(level)->coordZ_SP[pos];
+
+            nodesVec[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
         }
-	}
+        std::string filenameVec = para->getFName() + "_PointsPressNeighbor_" + StringUtil::toString<int>(level);
+        WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec, nodesVec);
+    }
+}
 
+//////////////////////////////////////////////////////////////////////////
+
+void writeNeighborXPointsDebug(Parameter *para)
+{
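+    // Writes the x-neighbor of every node as a VTK point cloud, one file per level.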
+    std::vector<UbTupleFloat3> nodesVec;
+    int nodeNumberVec = 0;
 
-	//////////////////////////////////////////////////////////////////////////
-    void writeGridInterfaceLinesNeighbors(Parameter* para, int level, const uint* interfaceIndices, uint numberOfNodes, const std::string& name)
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
+    }
+    nodesVec.resize(nodeNumberVec);
+    int nodeCount2 = 0;
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->size_Mat_SP; u++) {
+            real x1 = para->getParH(level)->coordX_SP[para->getParH(level)->neighborX_SP[u]];
+            real x2 = para->getParH(level)->coordY_SP[para->getParH(level)->neighborX_SP[u]];
+            real x3 = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborX_SP[u]];
+
+            nodesVec[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+        }
+        std::string filenameVec = para->getFName() + "_PointsNeighborX_" + StringUtil::toString<int>(level);
+        WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec, nodesVec);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+void writeNeighborXLinesDebug(Parameter *para)
+{
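+    // Connects every node to its x-neighbor; lines are only emitted for fluid nodes (GEO_FLUID).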
+    std::vector<UbTupleFloat3> nodesVec;
+    std::vector<UbTupleInt2> cellsVec;
+    int nodeNumberVec = 0;
+
+    for (int level = 0; level < para->getMaxLevel(); level++) // possibly maxLevel + 1
     {
-        std::vector<UbTupleFloat3> nodes(numberOfNodes * 2);
-        std::vector<UbTupleInt2> cells(numberOfNodes);
-
-        int actualNodeNumber = 0;
-        for (uint u = 0; u < numberOfNodes; u++)
-        {
-            const int pos = interfaceIndices[u];
-            const double x1 = para->getParH(level)->coordX_SP[pos];
-            const double x2 = para->getParH(level)->coordY_SP[pos];
-            const double x3 = para->getParH(level)->coordZ_SP[pos];
-	
-            const double x1Neighbor = para->getParH(level)->coordX_SP[para->getParH(level)->neighborX_SP[pos]];
-            const double x2Neighbor = para->getParH(level)->coordY_SP[para->getParH(level)->neighborY_SP[pos]];
-            const double x3Neighbor = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborZ_SP[pos]];
-
-            nodes[actualNodeNumber++] = (makeUbTuple(float(x1), float(x2), float(x3)));
-            nodes[actualNodeNumber++] = (makeUbTuple(float(x1Neighbor), float(x2Neighbor), float(x3Neighbor)));
-
-            cells[u] = makeUbTuple(actualNodeNumber - 2, actualNodeNumber - 1);
+        nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
+    }
+    nodesVec.resize(nodeNumberVec * 2);
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->size_Mat_SP; u++) {
+            real x1  = para->getParH(level)->coordX_SP[u];
+            real x2  = para->getParH(level)->coordY_SP[u];
+            real x3  = para->getParH(level)->coordZ_SP[u];
+            real x1N = para->getParH(level)->coordX_SP[para->getParH(level)->neighborX_SP[u]];
+            real x2N = para->getParH(level)->coordY_SP[para->getParH(level)->neighborX_SP[u]];
+            real x3N = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborX_SP[u]];
+
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1N), (float)(x2N), (float)(x3N)));
+
+            if (para->getParH(level)->geoSP[u] == GEO_FLUID) {
+                cellsVec.push_back(makeUbTuple(nodeCount - 2, nodeCount - 1));
+            }
         }
-        WbWriterVtkXmlBinary::getInstance()->writeLines(name, nodes, cells);
+        std::string filenameVec = para->getFName() + "_" + StringUtil::toString<int>(level) + "_NeighborX_Lines.vtk";
+        WbWriterVtkXmlBinary::getInstance()->writeLines(filenameVec, nodesVec, cellsVec);
     }
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+void writeNeighborYPointsDebug(Parameter *para)
+{
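+    // Same as the x-variant above, following the y-neighbor links.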
+    std::vector<UbTupleFloat3> nodesVec;
+    int nodeNumberVec = 0;
+
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
+    }
+    nodesVec.resize(nodeNumberVec);
+    int nodeCount2 = 0;
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->size_Mat_SP; u++) {
+            real x1 = para->getParH(level)->coordX_SP[para->getParH(level)->neighborY_SP[u]];
+            real x2 = para->getParH(level)->coordY_SP[para->getParH(level)->neighborY_SP[u]];
+            real x3 = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborY_SP[u]];
+
+            nodesVec[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+        }
+        std::string filenameVec = para->getFName() + "_PointsNeighborY_" + StringUtil::toString<int>(level);
+        WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec, nodesVec);
+    }
+}
 
-	void writeInterfaceLinesDebugCFCneighbor(Parameter* para)
+//////////////////////////////////////////////////////////////////////////
+
+void writeNeighborYLinesDebug(Parameter *para)
+{
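+    // Same as writeNeighborXLinesDebug, following the y-neighbor links.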
+    std::vector<UbTupleFloat3> nodesVec;
+    std::vector<UbTupleInt2> cellsVec;
+    int nodeNumberVec = 0;
+
+    for (int level = 0; level < para->getMaxLevel(); level++) // possibly maxLevel + 1
     {
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-            std::string filename = para->getFName() + "_" + StringUtil::toString<int>(level) + "_CFCneighbor.vtk";
-            writeGridInterfaceLinesNeighbors(para, level, para->getParH(level)->intCF.ICellCFC, para->getParH(level)->K_CF, filename);
-		}
-	}
+        nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
+    }
+    nodesVec.resize(nodeNumberVec * 2);
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->size_Mat_SP; u++) {
+            real x1  = para->getParH(level)->coordX_SP[u];
+            real x2  = para->getParH(level)->coordY_SP[u];
+            real x3  = para->getParH(level)->coordZ_SP[u];
+            real x1N = para->getParH(level)->coordX_SP[para->getParH(level)->neighborY_SP[u]];
+            real x2N = para->getParH(level)->coordY_SP[para->getParH(level)->neighborY_SP[u]];
+            real x3N = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborY_SP[u]];
+
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1N), (float)(x2N), (float)(x3N)));
+
+            if (para->getParH(level)->geoSP[u] == GEO_FLUID) {
+                cellsVec.push_back(makeUbTuple(nodeCount - 2, nodeCount - 1));
+            }
+        }
+        std::string filenameVec = para->getFName() + "_" + StringUtil::toString<int>(level) + "_NeighborY_Lines.vtk";
+        WbWriterVtkXmlBinary::getInstance()->writeLines(filenameVec, nodesVec, cellsVec);
+    }
+}
 
+//////////////////////////////////////////////////////////////////////////
+
+void writeNeighborZPointsDebug(Parameter *para)
+{
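+    // Same as the x- and y-variants, following the z-neighbor links.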
+    std::vector<UbTupleFloat3> nodesVec;
+    int nodeNumberVec = 0;
 
-	//////////////////////////////////////////////////////////////////////////
-	void writeInterfaceLinesDebugCFFneighbor(Parameter* para)
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
+    }
+    nodesVec.resize(nodeNumberVec);
+    int nodeCount2 = 0;
+    for (int level = 0; level <= para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->size_Mat_SP; u++) {
+            real x1 = para->getParH(level)->coordX_SP[para->getParH(level)->neighborZ_SP[u]];
+            real x2 = para->getParH(level)->coordY_SP[para->getParH(level)->neighborZ_SP[u]];
+            real x3 = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborZ_SP[u]];
+
+            nodesVec[nodeCount2++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+        }
+        std::string filenameVec = para->getFName() + "_PointsNeighborZ_" + StringUtil::toString<int>(level);
+        WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec, nodesVec);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+
+void writeNeighborZLinesDebug(Parameter *para)
+{
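+    // Same as writeNeighborXLinesDebug, following the z-neighbor links.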
+    std::vector<UbTupleFloat3> nodesVec;
+    std::vector<UbTupleInt2> cellsVec;
+    int nodeNumberVec = 0;
+
+    for (int level = 0; level < para->getMaxLevel(); level++) // possibly maxLevel + 1
     {
-        for (int level = 0; level < para->getMaxLevel(); level++)
-        {
-            std::string filename = para->getFName() + "_" + StringUtil::toString<int>(level) + "_CFFneighbor.vtk";
-            writeGridInterfaceLinesNeighbors(para, level + 1, para->getParH(level)->intCF.ICellCFF, para->getParH(level)->K_CF, filename);
+        nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
+    }
+    nodesVec.resize(nodeNumberVec * 2);
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->size_Mat_SP; u++) {
+            real x1  = para->getParH(level)->coordX_SP[u];
+            real x2  = para->getParH(level)->coordY_SP[u];
+            real x3  = para->getParH(level)->coordZ_SP[u];
+            real x1N = para->getParH(level)->coordX_SP[para->getParH(level)->neighborZ_SP[u]];
+            real x2N = para->getParH(level)->coordY_SP[para->getParH(level)->neighborZ_SP[u]];
+            real x3N = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborZ_SP[u]];
+
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1N), (float)(x2N), (float)(x3N)));
+
+            if (para->getParH(level)->geoSP[u] == GEO_FLUID) {
+                cellsVec.push_back(makeUbTuple(nodeCount - 2, nodeCount - 1));
+            }
         }
-	}
+        std::string filenameVec = para->getFName() + "_" + StringUtil::toString<int>(level) + "_NeighborZ_Lines.vtk";
+        WbWriterVtkXmlBinary::getInstance()->writeLines(filenameVec, nodesVec, cellsVec);
+    }
+}
 
+//////////////////////////////////////////////////////////////////////////
+
+void writeInterfaceCellsDebugCFC(Parameter *para)
+{
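+    // Writes each coarse CF interface cell as a hexahedron spanned by the node and its x-, y- and z-neighbors.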
 
-	//////////////////////////////////////////////////////////////////////////
-	void writeInterfaceLinesDebugFCCneighbor(Parameter* para)
+    std::vector<UbTupleFloat3> nodesVec;
+    std::vector<UbTupleInt8> cellsVec;
+    int nodeNumberVec = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) // possibly maxLevel + 1
     {
-        for (int level = 0; level < para->getMaxLevel(); level++)
-        {
-            std::string filename = para->getFName() + "_" + StringUtil::toString<int>(level) + "_FCCneighbor.vtk";
-            writeGridInterfaceLinesNeighbors(para, level, para->getParH(level)->intFC.ICellFCC, para->getParH(level)->K_FC, filename);
+        nodeNumberVec += (int)para->getParH(level)->K_CF;
+    }
+    nodesVec.resize(nodeNumberVec * 8);
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->K_CF; u++) {
+            int pos = para->getParH(level)->intCF.ICellCFC[u];
+
+            double x1             = para->getParH(level)->coordX_SP[pos];
+            double x2             = para->getParH(level)->coordY_SP[pos];
+            double x3             = para->getParH(level)->coordZ_SP[pos];
+            double x1P            = para->getParH(level)->coordX_SP[para->getParH(level)->neighborX_SP[pos]];
+            double x2P            = para->getParH(level)->coordY_SP[para->getParH(level)->neighborY_SP[pos]];
+            double x3P            = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborZ_SP[pos]];
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1P), (float)(x2), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1P), (float)(x2P), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2P), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3P)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1P), (float)(x2), (float)(x3P)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1P), (float)(x2P), (float)(x3P)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2P), (float)(x3P)));
+
+            cellsVec.push_back(makeUbTuple(nodeCount - 8, nodeCount - 7, nodeCount - 6, nodeCount - 5, nodeCount - 4,
+                                           nodeCount - 3, nodeCount - 2, nodeCount - 1));
         }
-	}
+        std::string filenameVec = para->getFName() + "_CellsCFC_" + StringUtil::toString<int>(level);
+        WbWriterVtkXmlBinary::getInstance()->writeOcts(filenameVec, nodesVec, cellsVec);
+    }
+}
 
+//////////////////////////////////////////////////////////////////////////
 
-	//////////////////////////////////////////////////////////////////////////
-	void writeInterfaceLinesDebugFCFneighbor(Parameter* para)
+void writeInterfaceCellsDebugCFF(Parameter *para)
+{
+    std::vector<UbTupleFloat3> nodesVec;
+    std::vector<UbTupleInt8> cellsVec;
+    int nodeNumberVec = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) // possibly maxLevel + 1
     {
-        for (int level = 0; level < para->getMaxLevel(); level++)
-        {
-            std::string filename = para->getFName() + "_" + StringUtil::toString<int>(level) + "_FCFneighbor.vtk";
-            writeGridInterfaceLinesNeighbors(para, level + 1, para->getParH(level)->intFC.ICellFCF, para->getParH(level)->K_FC, filename);
+        nodeNumberVec += (int)para->getParH(level)->K_CF;
+    }
+    nodesVec.resize(nodeNumberVec * 8);
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->K_CF; u++) {
+            int pos = para->getParH(level)->intCF.ICellCFF[u];
+
+            double x1             = para->getParH(level + 1)->coordX_SP[pos];
+            double x2             = para->getParH(level + 1)->coordY_SP[pos];
+            double x3             = para->getParH(level + 1)->coordZ_SP[pos];
+            double x1P            = para->getParH(level + 1)->coordX_SP[para->getParH(level + 1)->neighborX_SP[pos]];
+            double x2P            = para->getParH(level + 1)->coordY_SP[para->getParH(level + 1)->neighborY_SP[pos]];
+            double x3P            = para->getParH(level + 1)->coordZ_SP[para->getParH(level + 1)->neighborZ_SP[pos]];
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1P), (float)(x2), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1P), (float)(x2P), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2P), (float)(x3)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3P)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1P), (float)(x2), (float)(x3P)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1P), (float)(x2P), (float)(x3P)));
+            nodesVec[nodeCount++] = (makeUbTuple((float)(x1), (float)(x2P), (float)(x3P)));
+
+            cellsVec.push_back(makeUbTuple(nodeCount - 8, nodeCount - 7, nodeCount - 6, nodeCount - 5, nodeCount - 4,
+                                           nodeCount - 3, nodeCount - 2, nodeCount - 1));
         }
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-	void writeInterfaceLinesDebugOff(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		std::vector< UbTupleInt2 > cellsVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level < para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->K_CF;
-		}
-		nodesVec.resize(nodeNumberVec*8);
-		int nodeCount = 0;
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->K_CF;u++)
-			{
-				double xoff = para->getParH(level)->offCF.xOffCF[u];
-				double yoff = para->getParH(level)->offCF.yOffCF[u];
-				double zoff = para->getParH(level)->offCF.zOffCF[u];
+        std::string filenameVec = para->getFName() + "_CellsCFF_" + StringUtil::toString<int>(level);
+        WbWriterVtkXmlBinary::getInstance()->writeOcts(filenameVec, nodesVec, cellsVec);
+    }
+}
+
 
-				int posFine = para->getParH(level)->intCF.ICellCFF[u];
-
-				double x1Fine = para->getParH(level+1)->coordX_SP[posFine];
-				double x2Fine = para->getParH(level+1)->coordY_SP[posFine];
-				double x3Fine = para->getParH(level+1)->coordZ_SP[posFine];
-
-				double x1 = x1Fine + xoff;
-				double x2 = x2Fine + yoff;
-				double x3 = x3Fine + zoff;
-
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1Fine),(float)(x2Fine),(float)(x3Fine) ) );
-
-				cellsVec.push_back( makeUbTuple(nodeCount-2,nodeCount-1) );
-
-			}
-			std::string filenameVec = para->getFName()+"_"+StringUtil::toString<int>(level)+"_OffDebugCF_Offs.vtk";
-			WbWriterVtkXmlBinary::getInstance()->writeLines(filenameVec,nodesVec,cellsVec);
-            cellsVec.clear();
-            nodesVec.clear();
-		}
-	}
 
 
-	//////////////////////////////////////////////////////////////////////////
 
 
-	void writeInterfacePointsDebugCFC(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec2;
-		int nodeNumberVec = 0;
 
-		for (int level = 0; level < para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->K_CF;
-		}
-		nodesVec2.resize(nodeNumberVec*8); 
-		int nodeCount2 = 0; 
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->K_CF;u++)
-			{
-				int pos = para->getParH(level)->intCF.ICellCFC[u];
-
-				double x1 = para->getParH(level)->coordX_SP[pos];
-				double x2 = para->getParH(level)->coordY_SP[pos];
-				double x3 = para->getParH(level)->coordZ_SP[pos];
-
-				nodesVec2[nodeCount2++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-
-			}
-			std::string filenameVec2 = para->getFName()+"_"+StringUtil::toString<int>(level)+"_OffDebugPointsCF.vtk";
-			WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec2,nodesVec2);
-		}
-	}
 
 
-	//////////////////////////////////////////////////////////////////////////
 
 
-	void writeBcPointsDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec2;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level <= para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->QWall.kQ;
-		}
-		nodesVec2.resize(nodeNumberVec*8); 
-		int nodeCount2 = 0; 
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			for(int u=0;u<para->getParH(level)->QWall.kQ;u++)
-			{
-				int pos = para->getParH(level)->QWall.k[u];
-
-				double x1 = para->getParH(level)->coordX_SP[pos];
-				double x2 = para->getParH(level)->coordY_SP[pos];
-				double x3 = para->getParH(level)->coordZ_SP[pos];
-
-				nodesVec2[nodeCount2++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-
-			}
-			std::string filenameVec2 = para->getFName()+"_PointsBc_"+StringUtil::toString<int>(level);
-			WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec2,nodesVec2);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writePressPointsDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level <= para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->QPress.kQ;
-		}
-		nodesVec.resize(nodeNumberVec); 
-		int nodeCount2 = 0; 
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			for(int u=0;u<para->getParH(level)->QPress.kQ;u++)
-			{
-				int pos = para->getParH(level)->QPress.k[u];
-
-				double x1 = para->getParH(level)->coordX_SP[pos];
-				double x2 = para->getParH(level)->coordY_SP[pos];
-				double x3 = para->getParH(level)->coordZ_SP[pos];
-
-				nodesVec[nodeCount2++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-
-			}
-			std::string filenameVec = para->getFName()+"_PointsPress_"+StringUtil::toString<int>(level);
-			WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec,nodesVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writePressNeighborPointsDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			nodeNumberVec += (int)para->getParH(level)->QPress.kQ;
-		}
-		nodesVec.resize(nodeNumberVec); 
-		int nodeCount2 = 0; 
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			for(int u=0;u<para->getParH(level)->QPress.kQ;u++)
-			{
-				int pos = para->getParH(level)->QPress.kN[u];
-
-				real x1 = para->getParH(level)->coordX_SP[pos];
-				real x2 = para->getParH(level)->coordY_SP[pos];
-				real x3 = para->getParH(level)->coordZ_SP[pos];
-
-				nodesVec[nodeCount2++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-			}
-			std::string filenameVec = para->getFName()+"_PointsPressNeighbor_"+StringUtil::toString<int>(level);
-			WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec,nodesVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writeNeighborXPointsDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
-		}
-		nodesVec.resize(nodeNumberVec); 
-		int nodeCount2 = 0;
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->size_Mat_SP;u++)
-			{
-				real x1 = para->getParH(level)->coordX_SP[para->getParH(level)->neighborX_SP[u]];
-				real x2 = para->getParH(level)->coordY_SP[para->getParH(level)->neighborX_SP[u]];
-				real x3 = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborX_SP[u]];
-
-				nodesVec[nodeCount2++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-			}
-			std::string filenameVec = para->getFName()+"_PointsNeighborX_"+StringUtil::toString<int>(level);
-			WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec,nodesVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writeNeighborXLinesDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		std::vector< UbTupleInt2 > cellsVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level < para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
-		}
-		nodesVec.resize(nodeNumberVec*2);
-		int nodeCount = 0;
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->size_Mat_SP;u++)
-			{
-				real x1 = para->getParH(level)->coordX_SP[u];
-				real x2 = para->getParH(level)->coordY_SP[u];
-				real x3 = para->getParH(level)->coordZ_SP[u];
-				real x1N = para->getParH(level)->coordX_SP[para->getParH(level)->neighborX_SP[u]];
-				real x2N = para->getParH(level)->coordY_SP[para->getParH(level)->neighborX_SP[u]];
-				real x3N = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborX_SP[u]];
-
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1N),(float)(x2N),(float)(x3N) ) );
-
-				if (para->getParH(level)->geoSP[u]==GEO_FLUID)
-				{
-					cellsVec.push_back( makeUbTuple(nodeCount-2,nodeCount-1) );
-				}
-
-			}
-			std::string filenameVec = para->getFName()+"_"+StringUtil::toString<int>(level)+"_NeighborX_Lines.vtk";
-			WbWriterVtkXmlBinary::getInstance()->writeLines(filenameVec,nodesVec,cellsVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writeNeighborYPointsDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
-		}
-		nodesVec.resize(nodeNumberVec); 
-		int nodeCount2 = 0;
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->size_Mat_SP;u++)
-			{
-				real x1 = para->getParH(level)->coordX_SP[para->getParH(level)->neighborY_SP[u]];
-				real x2 = para->getParH(level)->coordY_SP[para->getParH(level)->neighborY_SP[u]];
-				real x3 = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborY_SP[u]];
-
-				nodesVec[nodeCount2++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-			}
-			std::string filenameVec = para->getFName()+"_PointsNeighborY_"+StringUtil::toString<int>(level);
-			WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec,nodesVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writeNeighborYLinesDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		std::vector< UbTupleInt2 > cellsVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level < para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
-		}
-		nodesVec.resize(nodeNumberVec*2);
-		int nodeCount = 0;
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->size_Mat_SP;u++)
-			{
-				real x1 = para->getParH(level)->coordX_SP[u];
-				real x2 = para->getParH(level)->coordY_SP[u];
-				real x3 = para->getParH(level)->coordZ_SP[u];
-				real x1N = para->getParH(level)->coordX_SP[para->getParH(level)->neighborY_SP[u]];
-				real x2N = para->getParH(level)->coordY_SP[para->getParH(level)->neighborY_SP[u]];
-				real x3N = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborY_SP[u]];
-
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1N),(float)(x2N),(float)(x3N) ) );
-
-				if (para->getParH(level)->geoSP[u]==GEO_FLUID)
-				{
-					cellsVec.push_back( makeUbTuple(nodeCount-2,nodeCount-1) );
-				}
-
-			}
-			std::string filenameVec = para->getFName()+"_"+StringUtil::toString<int>(level)+"_NeighborY_Lines.vtk";
-			WbWriterVtkXmlBinary::getInstance()->writeLines(filenameVec,nodesVec,cellsVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writeNeighborZPointsDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
-		}
-		nodesVec.resize(nodeNumberVec); 
-		int nodeCount2 = 0;
-		for (int level = 0; level <= para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->size_Mat_SP;u++)
-			{
-				real x1 = para->getParH(level)->coordX_SP[para->getParH(level)->neighborZ_SP[u]];
-				real x2 = para->getParH(level)->coordY_SP[para->getParH(level)->neighborZ_SP[u]];
-				real x3 = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborZ_SP[u]];
-
-				nodesVec[nodeCount2++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-			}
-			std::string filenameVec = para->getFName()+"_PointsNeighborZ_"+StringUtil::toString<int>(level);
-			WbWriterVtkXmlBinary::getInstance()->writeNodes(filenameVec,nodesVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writeNeighborZLinesDebug(Parameter* para){
-		std::vector< UbTupleFloat3 > nodesVec;
-		std::vector< UbTupleInt2 > cellsVec;
-		int nodeNumberVec = 0;
-
-		for (int level = 0; level < para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->size_Mat_SP;
-		}
-		nodesVec.resize(nodeNumberVec*2);
-		int nodeCount = 0;
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->size_Mat_SP;u++)
-			{
-				real x1 = para->getParH(level)->coordX_SP[u];
-				real x2 = para->getParH(level)->coordY_SP[u];
-				real x3 = para->getParH(level)->coordZ_SP[u];
-				real x1N = para->getParH(level)->coordX_SP[para->getParH(level)->neighborZ_SP[u]];
-				real x2N = para->getParH(level)->coordY_SP[para->getParH(level)->neighborZ_SP[u]];
-				real x3N = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborZ_SP[u]];
-
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1),(float)(x2),(float)(x3) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1N),(float)(x2N),(float)(x3N) ) );
-
-				if (para->getParH(level)->geoSP[u]==GEO_FLUID)
-				{
-					cellsVec.push_back( makeUbTuple(nodeCount-2,nodeCount-1) );
-				}
-
-			}
-			std::string filenameVec = para->getFName()+"_"+StringUtil::toString<int>(level)+"_NeighborZ_Lines.vtk";
-			WbWriterVtkXmlBinary::getInstance()->writeLines(filenameVec,nodesVec,cellsVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writeInterfaceCellsDebugCFC(Parameter* para){
-
-		std::vector< UbTupleFloat3 > nodesVec;
-		std::vector< UbTupleInt8 > cellsVec;
-		int nodeNumberVec = 0;
-		for (int level = 0; level < para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->K_CF;
-		}
-		nodesVec.resize(nodeNumberVec*8);
-		int nodeCount = 0;
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->K_CF;u++)
-			{
-				int pos  = para->getParH(level)->intCF.ICellCFC[u];
-
-				double x1  = para->getParH(level)->coordX_SP[pos];
-				double x2  = para->getParH(level)->coordY_SP[pos];
-				double x3  = para->getParH(level)->coordZ_SP[pos];
-				double x1P = para->getParH(level)->coordX_SP[para->getParH(level)->neighborX_SP[pos]];
-				double x2P = para->getParH(level)->coordY_SP[para->getParH(level)->neighborY_SP[pos]];
-				double x3P = para->getParH(level)->coordZ_SP[para->getParH(level)->neighborZ_SP[pos]];
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1 ),(float)(x2 ),(float)(x3 ) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1P),(float)(x2 ),(float)(x3 ) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1P),(float)(x2P),(float)(x3 ) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1 ),(float)(x2P),(float)(x3 ) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1 ),(float)(x2 ),(float)(x3P) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1P),(float)(x2 ),(float)(x3P) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1P),(float)(x2P),(float)(x3P) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1 ),(float)(x2P),(float)(x3P) ) );
-
-				cellsVec.push_back( makeUbTuple(nodeCount-8,nodeCount-7,nodeCount-6,nodeCount-5,nodeCount-4,nodeCount-3,nodeCount-2,nodeCount-1) );
-
-			}
-			std::string filenameVec = para->getFName()+"_CellsCFC_"+StringUtil::toString<int>(level);
-			WbWriterVtkXmlBinary::getInstance()->writeOcts(filenameVec,nodesVec,cellsVec);
-		}
-	}
-
-
-	//////////////////////////////////////////////////////////////////////////
-
-
-	void writeInterfaceCellsDebugCFF(Parameter* para){
-
-		std::vector< UbTupleFloat3 > nodesVec;
-		std::vector< UbTupleInt8 > cellsVec;
-		int nodeNumberVec = 0;
-		for (int level = 0; level < para->getMaxLevel(); level++) //evtl. Maxlevel + 1
-		{
-			nodeNumberVec += (int)para->getParH(level)->K_CF;
-		}
-		nodesVec.resize(nodeNumberVec*8);
-		int nodeCount = 0;
-		for (int level = 0; level < para->getMaxLevel(); level++)
-		{
-			for(unsigned int u=0;u<para->getParH(level)->K_CF;u++)
-			{
-				int pos  = para->getParH(level  )->intCF.ICellCFF[u];
-
-				double x1  = para->getParH(level+1)->coordX_SP[pos];
-				double x2  = para->getParH(level+1)->coordY_SP[pos];
-				double x3  = para->getParH(level+1)->coordZ_SP[pos];
-				double x1P = para->getParH(level+1)->coordX_SP[para->getParH(level+1)->neighborX_SP[pos]];
-				double x2P = para->getParH(level+1)->coordY_SP[para->getParH(level+1)->neighborY_SP[pos]];
-				double x3P = para->getParH(level+1)->coordZ_SP[para->getParH(level+1)->neighborZ_SP[pos]];
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1 ),(float)(x2 ),(float)(x3 ) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1P),(float)(x2 ),(float)(x3 ) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1P),(float)(x2P),(float)(x3 ) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1 ),(float)(x2P),(float)(x3 ) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1 ),(float)(x2 ),(float)(x3P) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1P),(float)(x2 ),(float)(x3P) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1P),(float)(x2P),(float)(x3P) ) );
-				nodesVec[nodeCount++]=( makeUbTuple( (float)(x1 ),(float)(x2P),(float)(x3P) ) );
-
-				cellsVec.push_back( makeUbTuple(nodeCount-8,nodeCount-7,nodeCount-6,nodeCount-5,nodeCount-4,nodeCount-3,nodeCount-2,nodeCount-1) );
-
-			}
-			std::string filenameVec = para->getFName()+"_CellsCFF_"+StringUtil::toString<int>(level);
-			WbWriterVtkXmlBinary::getInstance()->writeOcts(filenameVec,nodesVec,cellsVec);
-		}
-	}
+
+//////////////////////////////////////////////////////////////////////////
+// Functions for the version with CUDA streams
+//////////////////////////////////////////////////////////////////////////
+void checkForSendOrRecvNode(int pos, int &commDir, int &commDirectionInCommAfterFtoC, int &indexInCommVector,
+                            std::vector<ProcessNeighbor27> &sendRecvProcessNeighbor,
+                            std::vector<ProcessNeighbor27> &sendRecvProcessNeighborsAfterFtoC, int indicator)
+{
+    for (uint pn = 0; pn < (uint)sendRecvProcessNeighbor.size(); pn++) {
+        for (int j = 0; j < sendRecvProcessNeighbor[pn].numberOfNodes; j++) {
+            if (pos == sendRecvProcessNeighbor[pn].index[j]) {
+                commDir = indicator;
+                indexInCommVector = j;
+                if (j < sendRecvProcessNeighborsAfterFtoC[pn].numberOfNodes) {
+                    commDirectionInCommAfterFtoC = indicator;
+                }
+                return;
+            }
+        }
+    }
+}
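+
+// Note: the direction indicators are distinct powers of two (x = 2, y = 4, z = 8).
+// In the de-duplicating stream writers further below, the codes of a node that is
+// communicated in several directions are summed, so the combinations stay
+// unambiguous, e.g. a node sent in x and z yields 2 + 8 = 10.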
+
+void checkForRecvNodeX(int pos, int &recvDir, int &recvDirectionInCommAfterFtoC, int &recvIndex, Parameter *para, int level)
+{
+    checkForSendOrRecvNode(pos, recvDir, recvDirectionInCommAfterFtoC, recvIndex, para->getParH(level)->recvProcessNeighborX,
+                           para->getParH(level)->recvProcessNeighborsAfterFtoCX, 2);
+}
+
+void checkForRecvNodeY(int pos, int &recvDir, int &recvDirectionInCommAfterFtoC, int &recvIndex, Parameter *para, int level)
+{
+    checkForSendOrRecvNode(pos, recvDir, recvDirectionInCommAfterFtoC, recvIndex, para->getParH(level)->recvProcessNeighborY,
+                           para->getParH(level)->recvProcessNeighborsAfterFtoCY, 4);
+}
+
+void checkForRecvNodeZ(int pos, int &recvDir, int &recvDirectionInCommAfterFtoC, int &recvIndex, Parameter *para, int level)
+{
+    checkForSendOrRecvNode(pos, recvDir, recvDirectionInCommAfterFtoC, recvIndex, para->getParH(level)->recvProcessNeighborZ,
+                           para->getParH(level)->recvProcessNeighborsAfterFtoCZ, 8);
+}
+
+void checkForSendNodeX(int pos, int &sendDir, int &sendDirectionInCommAfterFtoC, int &sendIndex, Parameter *para, int level)
+{
+    checkForSendOrRecvNode(pos, sendDir, sendDirectionInCommAfterFtoC, sendIndex, para->getParH(level)->sendProcessNeighborX,
+                           para->getParH(level)->sendProcessNeighborsAfterFtoCX, 2);
+}
+
+void checkForSendNodeY(int pos, int &sendDir, int &sendDirectionInCommAfterFtoC, int &sendIndex, Parameter *para, int level)
+{
+    checkForSendOrRecvNode(pos, sendDir, sendDirectionInCommAfterFtoC, sendIndex, para->getParH(level)->sendProcessNeighborY,
+                           para->getParH(level)->sendProcessNeighborsAfterFtoCY, 4);
+}
+
+void checkForSendNodeZ(int pos, int &sendDir, int &sendDirectionInCommAfterFtoC, int &sendIndex, Parameter *para, int level)
+{
+    checkForSendOrRecvNode(pos, sendDir, sendDirectionInCommAfterFtoC, sendIndex, para->getParH(level)->sendProcessNeighborZ,
+                           para->getParH(level)->sendProcessNeighborsAfterFtoCZ, 8);
+}
+
+void writeInterfaceFCC_Send(Parameter *para)
+{
+    std::vector<UbTupleFloat3> nodesVec;
+    int nodeNumberVec = 0;
+
+    // nodedata
+    std::vector<std::string> datanames = { "sparse index", "borderBulk", "sendDirection",
+                                           "sendDirectionInCommAfterFtoC", "sendIndex" };
+    // sendDirection: x = 2, y = 4, z = 8
+    // borderBulk: border = 1, bulk = 0
+    std::vector<std::vector<double>> nodedata;
+
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        nodeNumberVec += (int)para->getParH(level)->intFC.kFC;
+    }
+
+    nodesVec.resize(nodeNumberVec);
+    nodedata.resize(datanames.size(), std::vector<double>(nodeNumberVec));
+
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->intFC.kFC; u++) {
+            int pos                = para->getParH(level)->intFC.ICellFCC[u];
+            nodedata[0][nodeCount] = pos;
+
+            // coordinate section
+            double x1           = para->getParH(level)->coordX_SP[pos];
+            double x2           = para->getParH(level)->coordY_SP[pos];
+            double x3           = para->getParH(level)->coordZ_SP[pos];
+            nodesVec[nodeCount] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+
+            // nodedata section
+            nodedata[1][nodeCount]           = u < para->getParH(level)->intFCBorder.kFC;
+            int sendDir                      = 0;
+            int sendDirectionInCommAfterFtoC = 0;
+            int sendIndex                    = 0;
+
+            checkForSendNodeX(pos, sendDir, sendDirectionInCommAfterFtoC, sendIndex, para, level);
+            checkForSendNodeY(pos, sendDir, sendDirectionInCommAfterFtoC, sendIndex, para, level);
+            checkForSendNodeZ(pos, sendDir, sendDirectionInCommAfterFtoC, sendIndex, para, level);
+            nodedata[2][nodeCount] = sendDir;
+            nodedata[3][nodeCount] = sendDirectionInCommAfterFtoC;
+            nodedata[4][nodeCount] = sendIndex;
+
+            nodeCount++;
+        }
+        std::string filenameVec = para->getFName() + "_writeInterfaceFCC_Send_PID_" +
+                                  std::to_string(vf::gpu::Communicator::getInstance().getPID()) + "_" +
+                                  StringUtil::toString<int>(level);
+
+        WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
+    }
+}
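+
+// The file written above tags every coarse node of the fine-to-coarse interface with
+// its border/bulk classification, send direction and send index, so the split of the
+// interface for communication hiding can be checked visually.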
+
+void writeInterfaceCFC_Recv(Parameter *para)
+{
+    std::vector<UbTupleFloat3> nodesVec;
+    int nodeNumberVec = 0;
+
+    // nodedata
+    std::vector<std::string> datanames = { "sparse index", "borderBulk", "recvDirection",
+                                           "recvDirectionInCommAfterFtoC", "recvIndex"};
+    // recvDirection: x = 2, y = 4, z = 8
+    // borderBulk: border = 1, bulk = 0
+    std::vector<std::vector<double>> nodedata;
+
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        nodeNumberVec += (int)para->getParH(level)->intCF.kCF;
+    }
+
+    nodesVec.resize(nodeNumberVec);
+    nodedata.resize(datanames.size(), std::vector<double>(nodeNumberVec));
+
+    int nodeCount = 0;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        for (unsigned int u = 0; u < para->getParH(level)->intCF.kCF; u++) {
+            int pos                = para->getParH(level)->intCF.ICellCFC[u];
+            nodedata[0][nodeCount] = pos;
+
+            // coordinate section
+            double x1           = para->getParH(level)->coordX_SP[pos];
+            double x2           = para->getParH(level)->coordY_SP[pos];
+            double x3           = para->getParH(level)->coordZ_SP[pos];
+            nodesVec[nodeCount] = (makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+
+            // nodedata section
+            nodedata[1][nodeCount]           = u < para->getParH(level)->intCFBorder.kCF;
+            int recvDir                      = 0;
+            int recvDirectionInCommAfterFtoC = 0;
+            int recvIndex                    = 0;
+
+            checkForRecvNodeX(pos, recvDir, recvDirectionInCommAfterFtoC, recvIndex, para, level);
+            checkForRecvNodeY(pos, recvDir, recvDirectionInCommAfterFtoC, recvIndex, para, level);
+            checkForRecvNodeZ(pos, recvDir, recvDirectionInCommAfterFtoC, recvIndex, para, level);
+            nodedata[2][nodeCount] = recvDir;
+            nodedata[3][nodeCount] = recvDirectionInCommAfterFtoC;
+            nodedata[4][nodeCount] = recvIndex;
+            nodeCount++;
+        }
+        std::string filenameVec = para->getFName() + "_writeInterfaceCFC_Recv_PID_" +
+                                  std::to_string(vf::gpu::Communicator::getInstance().getPID()) + "_" +
+                                  StringUtil::toString<int>(level);
+
+        WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
+    }
+}
+
+void addToNodesVector(const int level, const int pos, std::vector<UbTupleFloat3> &nodesVec, Parameter *para)
+{
+    double x1 = para->getParH(level)->coordX_SP[pos];
+    double x2 = para->getParH(level)->coordY_SP[pos];
+    double x3 = para->getParH(level)->coordZ_SP[pos];
+    nodesVec.push_back(makeUbTuple((float)(x1), (float)(x2), (float)(x3)));
+}
+
+void writeSendNodesStream(Parameter *para)
+{
+    std::vector<UbTupleFloat3> nodesVec;
+
+    // nodedata
+    std::vector<std::string> datanames = { "sparse index", "sendDirection", "sendDirectionInCommAfterFtoC", "sendIndex",
+                                           "inICcellFCC" };
+    // sendDirection: x = 2, y = 4, z = 8
+    std::vector<std::vector<double>> nodedata;
+    nodedata.resize(datanames.size());
+
+    int pos;
+    int sendDirectionInCommAfterFtoC;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        // X
+        for (int pn = 0; pn < (int)para->getParH(level)->sendProcessNeighborX.size(); pn++) {
+            for (int i = 0; i < para->getParH(level)->sendProcessNeighborX[pn].numberOfNodes; i++) {
+                pos = para->getParH(level)->sendProcessNeighborX[pn].index[i];
+                nodedata[0].push_back(pos);
+                addToNodesVector(level, pos, nodesVec, para);
+
+                nodedata[1].push_back(2.0);
+                sendDirectionInCommAfterFtoC =
+                    (i < para->getParH(level)->sendProcessNeighborsAfterFtoCX[pn].numberOfNodes) ? 2.0 : 0.0;
+                nodedata[2].push_back(sendDirectionInCommAfterFtoC);
+                nodedata[3].push_back((double)i);
+            }
+        }
+
+        // Y
+        for (int pn = 0; pn < (int)para->getParH(level)->sendProcessNeighborY.size(); pn++) {
+            for (int i = 0; i < para->getParH(level)->sendProcessNeighborY[pn].numberOfNodes; i++) {
+                pos = para->getParH(level)->sendProcessNeighborY[pn].index[i];
+
+                sendDirectionInCommAfterFtoC =
+                    (i < para->getParH(level)->sendProcessNeighborsAfterFtoCY[pn].numberOfNodes) ? 4.0 : 0.0;
+
+                auto it = std::find(nodedata[0].begin(), nodedata[0].end(), pos);
+                if (it == nodedata[0].end()) {
+                    nodedata[0].push_back(pos);
+                    addToNodesVector(level, pos, nodesVec, para);
+                    nodedata[1].push_back(4.0);
+                    nodedata[2].push_back(sendDirectionInCommAfterFtoC);
+                    nodedata[3].push_back((double) i);
+                } else {
+                    int posInVectors = it - nodedata[0].begin();
+                    nodedata[1][posInVectors] += 4.0;
+                    nodedata[2][posInVectors] += sendDirectionInCommAfterFtoC;
+                    nodedata[3][posInVectors] = (double)i;
+                }
+            }
+        }
+
+        // Z
+        for (int pn = 0; pn < (int)para->getParH(level)->sendProcessNeighborZ.size(); pn++) {
+            for (int i = 0; i < para->getParH(level)->sendProcessNeighborZ[pn].numberOfNodes; i++) {
+                pos = para->getParH(level)->sendProcessNeighborZ[pn].index[i];
+
+                sendDirectionInCommAfterFtoC =
+                    (i < para->getParH(level)->sendProcessNeighborsAfterFtoCZ[pn].numberOfNodes) ? 8.0 : 0.0;
+
+                auto it = std::find(nodedata[0].begin(), nodedata[0].end(), pos);
+                if (it == nodedata[0].end()) {
+                    nodedata[0].push_back(pos);
+                    addToNodesVector(level, pos, nodesVec, para);
+                    nodedata[1].push_back(8.0);
+                    nodedata[2].push_back(sendDirectionInCommAfterFtoC);
+                    nodedata[3].push_back((double) i);
+                } else {
+                    int posInVectors = it - nodedata[0].begin();
+                    nodedata[1][posInVectors] += 8.0;
+                    nodedata[2][posInVectors] += sendDirectionInCommAfterFtoC;
+                    nodedata[3][posInVectors] = (double)i;
+                }
+            }
+        }
+
+        // check for each node whether it is part of iCellFCC
+        nodedata[4].resize(nodedata[0].size());
+        for (int i = 0; i < (int)nodedata[0].size(); i++) {
+            pos            = nodedata[0][i];
+            nodedata[4][i] = 0.0;
+            for (unsigned int u = 0; u < para->getParH(level)->intFC.kFC; u++) {
+                if (para->getParH(level)->intFC.ICellFCC[u] == (uint)pos) {
+                    nodedata[4][i] = 1.0;
+                    break;
+                }
+            }
+        }
+        std::string filenameVec = para->getFName() + "_writeSendNodesStreams_PID_" +
+                                  std::to_string(vf::gpu::Communicator::getInstance().getPID()) + "_" +
+                                  StringUtil::toString<int>(level);
+
+        WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
+    }
+}
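+
+// Note on writeSendNodesStream: a node that appears in more than one process-neighbor
+// list (domain edges and corners) is stored only once; its direction codes are summed
+// instead, e.g. sendDirection 6 = x (2) + y (4) and 14 = x + y + z.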
+
+void writeRecvNodesStream(Parameter *para)
+{
+    std::vector<UbTupleFloat3> nodesVec;
+
+    // nodedata
+    std::vector<std::string> datanames = { "sparse index", "recvDirection", "recvDirectionInCommAfterFtoC", "recvIndex" };
+    // recvDirection: x = 2, y = 4, z = 8
+    std::vector<std::vector<double>> nodedata;
+    nodedata.resize(datanames.size());
+
+    int pos;
+    int recvDirectionInCommAfterFtoC;
+    for (int level = 0; level < para->getMaxLevel(); level++) {
+        // X
+        for (int pn = 0; pn < (int)para->getParH(level)->recvProcessNeighborX.size(); pn++) {
+            for (int i = 0; i < para->getParH(level)->recvProcessNeighborX[pn].numberOfNodes; i++) {
+                pos = para->getParH(level)->recvProcessNeighborX[pn].index[i];
+                nodedata[0].push_back(pos);
+                addToNodesVector(level, pos, nodesVec, para);
+
+                nodedata[1].push_back(2.0);
+                recvDirectionInCommAfterFtoC =
+                    (i < para->getParH(level)->recvProcessNeighborsAfterFtoCX[pn].numberOfNodes) ? 2.0 : 0.0;
+                nodedata[2].push_back(recvDirectionInCommAfterFtoC);
+                nodedata[3].push_back(i);
+            }
+        }
+
+        // Y
+        for (int pn = 0; pn < (int)para->getParH(level)->recvProcessNeighborY.size(); pn++) {
+            for (int i = 0; i < para->getParH(level)->recvProcessNeighborY[pn].numberOfNodes; i++) {
+                pos = para->getParH(level)->recvProcessNeighborY[pn].index[i];
+
+                recvDirectionInCommAfterFtoC =
+                    (i < para->getParH(level)->recvProcessNeighborsAfterFtoCY[pn].numberOfNodes) ? 4.0 : 0.0;
+
+                auto it = std::find(nodedata[0].begin(), nodedata[0].end(), pos);
+                if (it == nodedata[0].end()) {
+                    nodedata[0].push_back(pos);
+                    addToNodesVector(level, pos, nodesVec, para);
+                    nodedata[1].push_back(4.0);
+                    nodedata[2].push_back(recvDirectionInCommAfterFtoC);
+                    nodedata[3].push_back(i);
+                } else {
+                    int posInVectors = it - nodedata[0].begin();
+                    nodedata[1][posInVectors] += 4.0;
+                    nodedata[2][posInVectors] += recvDirectionInCommAfterFtoC;
+                    nodedata[3][posInVectors] = (double)i;
+                }
+            }
+        }
+
+        // Z
+        for (int pn = 0; pn < (int)para->getParH(level)->recvProcessNeighborZ.size(); pn++) {
+            for (int i = 0; i < para->getParH(level)->recvProcessNeighborZ[pn].numberOfNodes; i++) {
+                pos = para->getParH(level)->recvProcessNeighborZ[pn].index[i];
+
+                recvDirectionInCommAfterFtoC =
+                    (i < para->getParH(level)->recvProcessNeighborsAfterFtoCZ[pn].numberOfNodes) ? 8.0 : 0.0;
+
+                auto it = std::find(nodedata[0].begin(), nodedata[0].end(), pos);
+                if (it == nodedata[0].end()) {
+                    nodedata[0].push_back(pos);
+                    addToNodesVector(level, pos, nodesVec, para);
+                    nodedata[1].push_back(8.0);
+                    nodedata[2].push_back(recvDirectionInCommAfterFtoC);
+                    nodedata[3].push_back(i);
+                } else {
+                    int posInVectors = it - nodedata[0].begin();
+                    nodedata[1][posInVectors] += 8.0;
+                    nodedata[2][posInVectors] += recvDirectionInCommAfterFtoC;
+                    nodedata[3][posInVectors] = (double)i;
+                }
+            }
+        }
+
+        // Recv nodes are ghost nodes and can therefore never be iCellCFCs
+
+        std::string filenameVec = para->getFName() + "_writeRecvNodesStreams_PID_" +
+                                  std::to_string(vf::gpu::Communicator::getInstance().getPID()) + "_" +
+                                  StringUtil::toString<int>(level);
+
+        WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(filenameVec, nodesVec, datanames, nodedata);
+    }
 }
+} // namespace InterfaceDebugWriter
 
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
index 823364a22eca41517816c1fdb61dfdc96ef1d961..d501e20f691850fd743885436bf274141753254a 100644
--- a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
@@ -3,6 +3,7 @@
 #include <cuda_runtime.h>
 #include "UbScheduler.h"
 #include "Timer.h"
+#include "VirtualFluids_GPU/Communication/Communicator.h"
 
 
 void Timer::initTimer()
@@ -30,7 +31,7 @@ void Timer::resetTimer()
         this->totalElapsedTime = 0.0;
 }
 
-void Timer::outputPerformance(uint t, Parameter* para)
+void Timer::outputPerformance(uint t, Parameter* para, vf::gpu::Communicator& communicator)
 {
     real fnups      = 0.0;
     real bandwidth  = 0.0;
@@ -41,11 +42,18 @@ void Timer::outputPerformance(uint t, Parameter* para)
         bandwidth   += (27.0+1.0) * 4.0 * 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP  / (this->totalElapsedTime*1.0E9);
     }
 
-    if(this->firstOutput)
+    if(this->firstOutput && communicator.getPID() == 0) //only display the legend once
     {
-        VF_LOG_INFO(" --- {} --- Processing time (ms) \t Nups in Mio \t Bandwidth in GB/sec", this->name );
+        VF_LOG_INFO("PID \t --- {} ---  Processing time (ms) \t Nups in Mio \t Bandwidth in GB/sec", this->name );
         this->firstOutput = false;
     }
 
-    VF_LOG_INFO(" --- {} --- {}/{} \t {} \t {}", this->name, this->elapsedTime, this->totalElapsedTime, fnups, bandwidth  );
+    VF_LOG_INFO(" {} \t --- {} --- {:>8.1f}/ {:<8.1f} \t   {:5.1f} \t       {:4.1f}",  communicator.getPID(), this->name, this->elapsedTime, this->totalElapsedTime, fnups, bandwidth);
+
+    // When using multiple GPUs, sum the nups of all processes
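+    // sumNups is presumably a collective reduction over all processes, so it must be
+    // called on every rank; only rank 0 prints the summed result.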
+    if (communicator.getNummberOfProcess() > 1) {
+        double nupsSum = communicator.sumNups(fnups);
+        if (communicator.getPID() == 0)
+            VF_LOG_INFO("Sum of all {} processes: Nups in Mio: {:.1f}", communicator.getNummberOfProcess(), nupsSum);
+    }
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.h b/src/gpu/VirtualFluids_GPU/Output/Timer.h
index 6432b347458e68a5089aea3de625017d6facd34b..26be785c7f76b7695656c9600bdb586804dca251 100644
--- a/src/gpu/VirtualFluids_GPU/Output/Timer.h
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.h
@@ -9,6 +9,10 @@
 #include "logger/Logger.h"
 #include "Parameter/Parameter.h"
 
+namespace vf::gpu{
+    class Communicator;
+}
+
 class Timer
 {
     public:
@@ -27,7 +31,7 @@ class Timer
     void startTimer();
     void stopTimer();
     void resetTimer();
-    void outputPerformance(uint t, Parameter* para);
+    void outputPerformance(uint t, Parameter* para, vf::gpu::Communicator& communicator);
 
     float getElapsedTime(){ return this->elapsedTime; }
     float getTotalElapsedTime(){ return this->totalElapsedTime; }
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3731836f336d91c1bc4cc5f1a8f5ea0a10bee0a6
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.cpp
@@ -0,0 +1,72 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//=======================================================================================
+#include "CudaStreamManager.h"
+#include <helper_cuda.h>
+#include <iostream>
+
+void CudaStreamManager::launchStreams(uint numberOfStreams)
+{
+    cudaStreams.resize(numberOfStreams);
+    for (cudaStream_t &stream : cudaStreams)
+        cudaStreamCreate(&stream);
+}
+
+void CudaStreamManager::terminateStreams()
+{
+    for (cudaStream_t &stream : cudaStreams)
+        cudaStreamDestroy(stream);
+}
+
+cudaStream_t &CudaStreamManager::getStream(uint streamIndex)
+{ return cudaStreams[streamIndex]; }
+
+int CudaStreamManager::getBorderStreamIndex() { return borderStreamIndex; }
+
+int CudaStreamManager::getBulkStreamIndex() { return bulkStreamIndex; }
+
+void CudaStreamManager::createCudaEvents()
+{
+    checkCudaErrors(cudaEventCreateWithFlags(&startBulkKernel, cudaEventDisableTiming));
+}
+
+void CudaStreamManager::destroyCudaEvents() 
+{ 
+    checkCudaErrors(cudaEventDestroy(startBulkKernel)); 
+}
+
+void CudaStreamManager::triggerStartBulkKernel(int streamIndex)
+{
+    checkCudaErrors(cudaEventRecord(startBulkKernel, cudaStreams[streamIndex]));
+}
+
+void CudaStreamManager::waitOnStartBulkKernelEvent(int streamIndex)
+{
+    checkCudaErrors(cudaStreamWaitEvent(cudaStreams[streamIndex], startBulkKernel));
+}
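+
+// A minimal usage sketch (hypothetical caller; kernel names are placeholders):
+//
+//     CudaStreamManager streamManager;
+//     streamManager.launchStreams(2); // bulkStreamIndex = 0, borderStreamIndex = 1
+//     streamManager.createCudaEvents();
+//
+//     borderKernel<<<grid, threads, 0, streamManager.getStream(streamManager.getBorderStreamIndex())>>>();
+//     streamManager.triggerStartBulkKernel(streamManager.getBorderStreamIndex());
+//     streamManager.waitOnStartBulkKernelEvent(streamManager.getBulkStreamIndex());
+//     bulkKernel<<<grid, threads, 0, streamManager.getStream(streamManager.getBulkStreamIndex())>>>();
+//
+//     streamManager.destroyCudaEvents();
+//     streamManager.terminateStreams();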
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2d515ab5fe9c24388632a7ca9e1e4c78b7f1467
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Parameter/CudaStreamManager.h
@@ -0,0 +1,61 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//=======================================================================================
+#ifndef STREAM_MANAGER_H
+#define STREAM_MANAGER_H
+
+#include <vector>
+#include "Core/DataTypes.h"
+
+#include <cuda_runtime.h>
+
+class CudaStreamManager
+{
+private:
+    std::vector<cudaStream_t> cudaStreams;
+    cudaEvent_t startBulkKernel = NULL;
+    const int borderStreamIndex       = 1;
+    const int bulkStreamIndex         = 0;
+
+public:
+    void launchStreams(uint numberOfStreams);
+    void terminateStreams();
+    cudaStream_t &getStream(uint streamIndex);
+
+    int getBorderStreamIndex();
+    int getBulkStreamIndex();
+
+    // Events
+    void createCudaEvents();
+    void destroyCudaEvents();
+    void triggerStartBulkKernel(int streamIndex);
+    void waitOnStartBulkKernelEvent(int streamIndex);
+};
+
+#endif
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
index c10b5b690bf8aa2c819b26acf1509f337debafe3..d63fb1d3f00172cba3fa8a15f449d6eb65199c8a 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
@@ -1,28 +1,28 @@
 //=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
 //           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -32,9 +32,9 @@
 //=======================================================================================
 #include "Parameter.h"
 
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
 
 #include <curand_kernel.h>
 
@@ -42,7 +42,7 @@
 
 #include <basics/config/ConfigurationFile.h>
 
-
+#include "Parameter/CudaStreamManager.h"
 
 Parameter::Parameter(const vf::basics::ConfigurationFile &configData, int numberOfProcesses, int myId)
 {
@@ -50,12 +50,14 @@ Parameter::Parameter(const vf::basics::ConfigurationFile &configData, int number
     ic.myid = myId;
 
     readConfigData(configData);
-    //initLBMSimulationParameter();
+    // initLBMSimulationParameter();
 }
 
+Parameter::~Parameter() = default;
+
 void Parameter::readConfigData(const vf::basics::ConfigurationFile &configData)
 {
-   if (configData.contains("NumberOfDevices"))
+    if (configData.contains("NumberOfDevices"))
         this->setMaxDev(configData.getValue<int>("NumberOfDevices"));
 
     //////////////////////////////////////////////////////////////////////////
@@ -109,13 +111,13 @@ void Parameter::readConfigData(const vf::basics::ConfigurationFile &configData)
     //////////////////////////////////////////////////////////////////////////
     if (configData.contains("UseMeasurePoints"))
         this->setUseMeasurePoints(configData.getValue<bool>("UseMeasurePoints"));
-	//////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
     if (configData.contains("UseWale"))
         this->setUseWale(configData.getValue<bool>("UseWale"));
-	//////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
     if (configData.contains("UseAMD"))
         this->setUseAMD(configData.getValue<bool>("UseAMD"));
-	//////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
     if (configData.contains("SGSconstant"))
         this->setSGSConstant(configData.getValue<real>("SGSconstant"));
     //////////////////////////////////////////////////////////////////////////
@@ -156,7 +158,7 @@ void Parameter::readConfigData(const vf::basics::ConfigurationFile &configData)
         this->setPressOutZ(configData.getValue<int>("PressOutZ"));
 
     //////////////////////////////////////////////////////////////////////////
-    //second component
+    // second component
     if (configData.contains("DiffOn"))
         this->setDiffOn(configData.getValue<bool>("DiffOn"));
     //////////////////////////////////////////////////////////////////////////
@@ -202,7 +204,21 @@ void Parameter::readConfigData(const vf::basics::ConfigurationFile &configData)
         this->setFactorPressBC(configData.getValue<real>("FactorPressBC"));
 
     //////////////////////////////////////////////////////////////////////////
-    //read Geometry (STL)
+    // CUDA streams and optimized communication
+    if (this->getNumprocs() > 1) {
+        if (configData.contains("useStreams")) {
+            if (configData.getValue<bool>("useStreams"))
+                this->setUseStreams(true);
+        }
+
+        if (configData.contains("useReducedCommunicationInInterpolation")) {
+            this->useReducedCommunicationAfterFtoC =
+                configData.getValue<bool>("useReducedCommunicationInInterpolation");
+        }
+    }
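+    // Hypothetical config file snippet enabling both options (they only take effect
+    // when running with more than one process):
+    //     useStreams = true
+    //     useReducedCommunicationInInterpolation = true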
+    //////////////////////////////////////////////////////////////////////////
+
+    // read Geometry (STL)
     if (configData.contains("ReadGeometry"))
         this->setReadGeo(configData.getValue<bool>("ReadGeometry"));
 
@@ -438,2124 +454,2262 @@ void Parameter::readConfigData(const vf::basics::ConfigurationFile &configData)
 
 void Parameter::initLBMSimulationParameter()
 {
-	//host
-	for (int i = coarse; i <= fine; i++)
-	{
-		parH[i]                        = std::make_shared<LBMSimulationParameter>();
-		parH[i]->numberofthreads       = 64;// 128;
-		parH[i]->gridNX                = getGridX().at(i);
-		parH[i]->gridNY                = getGridY().at(i);
-		parH[i]->gridNZ                = getGridZ().at(i);
-		parH[i]->vis                   = ic.vis*pow(2.f,i);
-		parH[i]->diffusivity           = ic.Diffusivity*pow(2.f,i);
-		parH[i]->omega                 = 1.0f/(3.0f*parH[i]->vis+0.5f);//omega :-) not s9 = -1.0f/(3.0f*parH[i]->vis+0.5f);//
-		parH[i]->nx                    = parH[i]->gridNX + 2 * STARTOFFX;
-		parH[i]->ny                    = parH[i]->gridNY + 2 * STARTOFFY;
-		parH[i]->nz                    = parH[i]->gridNZ + 2 * STARTOFFZ;
-		parH[i]->size_Mat              = parH[i]->nx * parH[i]->ny * parH[i]->nz;
-		parH[i]->sizePlaneXY           = parH[i]->nx * parH[i]->ny;
-		parH[i]->sizePlaneYZ           = parH[i]->ny * parH[i]->nz;
-		parH[i]->sizePlaneXZ           = parH[i]->nx * parH[i]->nz;
-		parH[i]->mem_size_real         = sizeof(real     ) * parH[i]->size_Mat;
-		parH[i]->mem_size_int          = sizeof(unsigned int) * parH[i]->size_Mat;
-		parH[i]->mem_size_bool         = sizeof(bool        ) * parH[i]->size_Mat;
-		parH[i]->mem_size_real_yz      = sizeof(real     ) * parH[i]->ny * parH[i]->nz;
-		parH[i]->evenOrOdd             = true;
-		parH[i]->startz                = parH[i]->gridNZ * ic.myid;
-		parH[i]->endz                  = parH[i]->gridNZ * ic.myid + parH[i]->gridNZ;
-		parH[i]->Lx                    = (real)((1.f*parH[i]->gridNX - 1.f)/(pow(2.f,i)));
-		parH[i]->Ly                    = (real)((1.f*parH[i]->gridNY - 1.f)/(pow(2.f,i)));
-		parH[i]->Lz                    = (real)((1.f*parH[i]->gridNZ - 1.f)/(pow(2.f,i)));
-		parH[i]->dx                    = (real)(1.f/(pow(2.f,i)));
-		parH[i]->XdistKn               = getDistX().at(i);
-		parH[i]->YdistKn               = getDistY().at(i);
-		parH[i]->ZdistKn               = getDistZ().at(i);
-		if (i == coarse)
-		{
-			parH[i]->distX                 = (real)getDistX().at(i);
-			parH[i]->distY                 = (real)getDistY().at(i);
-			parH[i]->distZ                 = (real)getDistZ().at(i);
-			parH[i]->mTtoWx                = (real)1.0f;
-			parH[i]->mTtoWy                = (real)1.0f;
-			parH[i]->mTtoWz                = (real)1.0f;
-			parH[i]->cTtoWx                = (real)0.0f;
-			parH[i]->cTtoWy                = (real)0.0f;
-			parH[i]->cTtoWz                = (real)0.0f;
-			////MGs Trafo///////////////////////////////////////////////////////////////
-			//parH[i]->cStartx               = (real)parH[i]->XdistKn;
-			//parH[i]->cStarty               = (real)parH[i]->XdistKn;
-			//parH[i]->cStartz               = (real)parH[i]->XdistKn;
-			////////////////////////////////////////////////////////////////////////////
-		} 
-		else
-		{
-			//Geller
-			parH[i]->distX                 = ((real)getDistX().at(i) + 0.25f) * parH[i-1]->dx;
-			parH[i]->distY                 = ((real)getDistY().at(i) + 0.25f) * parH[i-1]->dx;
-			parH[i]->distZ                 = ((real)getDistZ().at(i) + 0.25f) * parH[i-1]->dx;
-			//parH[i]->distX                 = ((real)getDistX().at(i) + 0.25f) * parH[i-1]->dx + parH[i-1]->distX;
-			//parH[i]->distY                 = ((real)getDistY().at(i) + 0.25f) * parH[i-1]->dx + parH[i-1]->distY;
-			//parH[i]->distZ                 = ((real)getDistZ().at(i) + 0.25f) * parH[i-1]->dx + parH[i-1]->distZ;
-			parH[i]->mTtoWx                = (real)pow(0.5f,i);
-			parH[i]->mTtoWy                = (real)pow(0.5f,i);
-			parH[i]->mTtoWz                = (real)pow(0.5f,i);
-			parH[i]->cTtoWx                = (real)(STARTOFFX/2.f + (parH[i]->gridNX+1.f)/4.f); //funzt nur fuer zwei level
-			parH[i]->cTtoWy                = (real)(STARTOFFY/2.f + (parH[i]->gridNY+1.f)/4.f); //funzt nur fuer zwei level
-			parH[i]->cTtoWz                = (real)(STARTOFFZ/2.f + (parH[i]->gridNZ+1.f)/4.f); //funzt nur fuer zwei level
-			////MGs Trafo///////////////////////////////////////////////////////////////
-			//parH[i]->cStartx               = (real)parH[i]->XdistKn;
-			//parH[i]->cStarty               = (real)parH[i]->XdistKn;
-			//parH[i]->cStartz               = (real)parH[i]->XdistKn;
-			////////////////////////////////////////////////////////////////////////////
-		}
-	}
-
-	//device
-	for (int i = coarse; i <= fine; i++)
-	{
-		parD[i]                        = std::make_shared<LBMSimulationParameter>();
-		parD[i]->numberofthreads       = parH[i]->numberofthreads;
-		parD[i]->gridNX                = parH[i]->gridNX;
-		parD[i]->gridNY                = parH[i]->gridNY;
-		parD[i]->gridNZ                = parH[i]->gridNZ;
-		parD[i]->vis                   = parH[i]->vis;
-		parD[i]->diffusivity           = parH[i]->diffusivity;
-		parD[i]->omega                 = parH[i]->omega;
-		parD[i]->nx                    = parH[i]->nx;
-		parD[i]->ny                    = parH[i]->ny;
-		parD[i]->nz                    = parH[i]->nz;
-		parD[i]->size_Mat              = parH[i]->size_Mat;
-		parD[i]->sizePlaneXY           = parH[i]->sizePlaneXY;
-		parD[i]->sizePlaneYZ           = parH[i]->sizePlaneYZ;
-		parD[i]->sizePlaneXZ           = parH[i]->sizePlaneXZ;
-		parD[i]->mem_size_real         = sizeof(real     ) * parD[i]->size_Mat;
-		parD[i]->mem_size_int          = sizeof(unsigned int) * parD[i]->size_Mat;
-		parD[i]->mem_size_bool         = sizeof(bool        ) * parD[i]->size_Mat;
-		parD[i]->mem_size_real_yz      = sizeof(real     ) * parD[i]->ny * parD[i]->nz;
-		parD[i]->evenOrOdd             = parH[i]->evenOrOdd;
-		parD[i]->startz                = parH[i]->startz;
-		parD[i]->endz                  = parH[i]->endz;
-		parD[i]->Lx                    = parH[i]->Lx;
-		parD[i]->Ly                    = parH[i]->Ly;
-		parD[i]->Lz                    = parH[i]->Lz;
-		parD[i]->dx                    = parH[i]->dx;
-		parD[i]->XdistKn               = parH[i]->XdistKn;
-		parD[i]->YdistKn               = parH[i]->YdistKn;
-		parD[i]->ZdistKn               = parH[i]->ZdistKn;
-		parD[i]->distX                 = parH[i]->distX;
-		parD[i]->distY                 = parH[i]->distY;
-		parD[i]->distZ                 = parH[i]->distZ;
-	}
+    // host
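+    // (each finer level i halves dx; the lattice viscosity and diffusivity are
+    // rescaled by 2^i, and omega follows from the rescaled viscosity)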
+    for (int i = coarse; i <= fine; i++) {
+        parH[i]                   = std::make_shared<LBMSimulationParameter>();
+        parH[i]->numberofthreads  = 64; // 128;
+        parH[i]->gridNX           = getGridX().at(i);
+        parH[i]->gridNY           = getGridY().at(i);
+        parH[i]->gridNZ           = getGridZ().at(i);
+        parH[i]->vis              = ic.vis * pow(2.f, i);
+        parH[i]->diffusivity      = ic.Diffusivity * pow(2.f, i);
+        parH[i]->omega            = 1.0f / (3.0f * parH[i]->vis + 0.5f); // relaxation rate omega (not s9 = -1.0f/(3.0f*parH[i]->vis+0.5f))
+        parH[i]->nx               = parH[i]->gridNX + 2 * STARTOFFX;
+        parH[i]->ny               = parH[i]->gridNY + 2 * STARTOFFY;
+        parH[i]->nz               = parH[i]->gridNZ + 2 * STARTOFFZ;
+        parH[i]->size_Mat         = parH[i]->nx * parH[i]->ny * parH[i]->nz;
+        parH[i]->sizePlaneXY      = parH[i]->nx * parH[i]->ny;
+        parH[i]->sizePlaneYZ      = parH[i]->ny * parH[i]->nz;
+        parH[i]->sizePlaneXZ      = parH[i]->nx * parH[i]->nz;
+        parH[i]->mem_size_real    = sizeof(real) * parH[i]->size_Mat;
+        parH[i]->mem_size_int     = sizeof(unsigned int) * parH[i]->size_Mat;
+        parH[i]->mem_size_bool    = sizeof(bool) * parH[i]->size_Mat;
+        parH[i]->mem_size_real_yz = sizeof(real) * parH[i]->ny * parH[i]->nz;
+        parH[i]->evenOrOdd        = true;
+        parH[i]->startz           = parH[i]->gridNZ * ic.myid;
+        parH[i]->endz             = parH[i]->gridNZ * ic.myid + parH[i]->gridNZ;
+        parH[i]->Lx               = (real)((1.f * parH[i]->gridNX - 1.f) / (pow(2.f, i)));
+        parH[i]->Ly               = (real)((1.f * parH[i]->gridNY - 1.f) / (pow(2.f, i)));
+        parH[i]->Lz               = (real)((1.f * parH[i]->gridNZ - 1.f) / (pow(2.f, i)));
+        parH[i]->dx               = (real)(1.f / (pow(2.f, i)));
+        parH[i]->XdistKn          = getDistX().at(i);
+        parH[i]->YdistKn          = getDistY().at(i);
+        parH[i]->ZdistKn          = getDistZ().at(i);
+        if (i == coarse) {
+            parH[i]->distX  = (real)getDistX().at(i);
+            parH[i]->distY  = (real)getDistY().at(i);
+            parH[i]->distZ  = (real)getDistZ().at(i);
+            parH[i]->mTtoWx = (real)1.0f;
+            parH[i]->mTtoWy = (real)1.0f;
+            parH[i]->mTtoWz = (real)1.0f;
+            parH[i]->cTtoWx = (real)0.0f;
+            parH[i]->cTtoWy = (real)0.0f;
+            parH[i]->cTtoWz = (real)0.0f;
+            //// MG's transformation ///////////////////////////////////////////////////
+            // parH[i]->cStartx               = (real)parH[i]->XdistKn;
+            // parH[i]->cStarty               = (real)parH[i]->XdistKn;
+            // parH[i]->cStartz               = (real)parH[i]->XdistKn;
+            ////////////////////////////////////////////////////////////////////////////
+        } else {
+            // Geller
+            parH[i]->distX = ((real)getDistX().at(i) + 0.25f) * parH[i - 1]->dx;
+            parH[i]->distY = ((real)getDistY().at(i) + 0.25f) * parH[i - 1]->dx;
+            parH[i]->distZ = ((real)getDistZ().at(i) + 0.25f) * parH[i - 1]->dx;
+            // parH[i]->distX                 = ((real)getDistX().at(i) + 0.25f) * parH[i-1]->dx + parH[i-1]->distX;
+            // parH[i]->distY                 = ((real)getDistY().at(i) + 0.25f) * parH[i-1]->dx + parH[i-1]->distY;
+            // parH[i]->distZ                 = ((real)getDistZ().at(i) + 0.25f) * parH[i-1]->dx + parH[i-1]->distZ;
+            parH[i]->mTtoWx = (real)pow(0.5f, i);
+            parH[i]->mTtoWy = (real)pow(0.5f, i);
+            parH[i]->mTtoWz = (real)pow(0.5f, i);
+            parH[i]->cTtoWx = (real)(STARTOFFX / 2.f + (parH[i]->gridNX + 1.f) / 4.f); // only works for two levels
+            parH[i]->cTtoWy = (real)(STARTOFFY / 2.f + (parH[i]->gridNY + 1.f) / 4.f); // only works for two levels
+            parH[i]->cTtoWz = (real)(STARTOFFZ / 2.f + (parH[i]->gridNZ + 1.f) / 4.f); // only works for two levels
+            //// MG's transformation ///////////////////////////////////////////////////
+            // parH[i]->cStartx               = (real)parH[i]->XdistKn;
+            // parH[i]->cStarty               = (real)parH[i]->XdistKn;
+            // parH[i]->cStartz               = (real)parH[i]->XdistKn;
+            ////////////////////////////////////////////////////////////////////////////
+        }
+    }
+
+    // device
+    for (int i = coarse; i <= fine; i++) {
+        parD[i]                   = std::make_shared<LBMSimulationParameter>();
+        parD[i]->numberofthreads  = parH[i]->numberofthreads;
+        parD[i]->gridNX           = parH[i]->gridNX;
+        parD[i]->gridNY           = parH[i]->gridNY;
+        parD[i]->gridNZ           = parH[i]->gridNZ;
+        parD[i]->vis              = parH[i]->vis;
+        parD[i]->diffusivity      = parH[i]->diffusivity;
+        parD[i]->omega            = parH[i]->omega;
+        parD[i]->nx               = parH[i]->nx;
+        parD[i]->ny               = parH[i]->ny;
+        parD[i]->nz               = parH[i]->nz;
+        parD[i]->size_Mat         = parH[i]->size_Mat;
+        parD[i]->sizePlaneXY      = parH[i]->sizePlaneXY;
+        parD[i]->sizePlaneYZ      = parH[i]->sizePlaneYZ;
+        parD[i]->sizePlaneXZ      = parH[i]->sizePlaneXZ;
+        parD[i]->mem_size_real    = sizeof(real) * parD[i]->size_Mat;
+        parD[i]->mem_size_int     = sizeof(unsigned int) * parD[i]->size_Mat;
+        parD[i]->mem_size_bool    = sizeof(bool) * parD[i]->size_Mat;
+        parD[i]->mem_size_real_yz = sizeof(real) * parD[i]->ny * parD[i]->nz;
+        parD[i]->evenOrOdd        = parH[i]->evenOrOdd;
+        parD[i]->startz           = parH[i]->startz;
+        parD[i]->endz             = parH[i]->endz;
+        parD[i]->Lx               = parH[i]->Lx;
+        parD[i]->Ly               = parH[i]->Ly;
+        parD[i]->Lz               = parH[i]->Lz;
+        parD[i]->dx               = parH[i]->dx;
+        parD[i]->XdistKn          = parH[i]->XdistKn;
+        parD[i]->YdistKn          = parH[i]->YdistKn;
+        parD[i]->ZdistKn          = parH[i]->ZdistKn;
+        parD[i]->distX            = parH[i]->distX;
+        parD[i]->distY            = parH[i]->distY;
+        parD[i]->distZ            = parH[i]->distZ;
+    }
 }
 
 void Parameter::copyMeasurePointsArrayToVector(int lev)
 {
-	int valuesPerClockCycle = (int)(getclockCycleForMP()/getTimestepForMP());
-	for(int i = 0; i < (int)parH[lev]->MP.size(); i++)
-	{
-		for(int j = 0; j < valuesPerClockCycle; j++)
-		{
-			int index = i*valuesPerClockCycle+j;
-			parH[lev]->MP[i].Vx.push_back(parH[lev]->VxMP[index]);
-			parH[lev]->MP[i].Vy.push_back(parH[lev]->VyMP[index]);
-			parH[lev]->MP[i].Vz.push_back(parH[lev]->VzMP[index]);
-			parH[lev]->MP[i].Rho.push_back(parH[lev]->RhoMP[index]);
-		}
-	}
+    int valuesPerClockCycle = (int)(getclockCycleForMP() / getTimestepForMP());
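+    // VxMP/VyMP/VzMP/RhoMP hold the samples of one clock cycle contiguously
+    // per measure point; unpack them into the per-point MP vectors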
+    for (int i = 0; i < (int)parH[lev]->MP.size(); i++) {
+        for (int j = 0; j < valuesPerClockCycle; j++) {
+            int index = i * valuesPerClockCycle + j;
+            parH[lev]->MP[i].Vx.push_back(parH[lev]->VxMP[index]);
+            parH[lev]->MP[i].Vy.push_back(parH[lev]->VyMP[index]);
+            parH[lev]->MP[i].Vz.push_back(parH[lev]->VzMP[index]);
+            parH[lev]->MP[i].Rho.push_back(parH[lev]->RhoMP[index]);
+        }
+    }
 }
 
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//set-methods
+// set-methods
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void Parameter::setForcing(real forcingX, real forcingY, real forcingZ)
 {
-	this->hostForcing[0] = forcingX;
-	this->hostForcing[1] = forcingY;
-	this->hostForcing[2] = forcingZ;
+    this->hostForcing[0] = forcingX;
+    this->hostForcing[1] = forcingY;
+    this->hostForcing[2] = forcingZ;
 }
 void Parameter::setQuadricLimiters(real quadricLimiterP, real quadricLimiterM, real quadricLimiterD)
-{	
-	this->hostQuadricLimiters[0] = quadricLimiterP;
-	this->hostQuadricLimiters[1] = quadricLimiterM;
-	this->hostQuadricLimiters[2] = quadricLimiterD;
+{
+    this->hostQuadricLimiters[0] = quadricLimiterP;
+    this->hostQuadricLimiters[1] = quadricLimiterM;
+    this->hostQuadricLimiters[2] = quadricLimiterD;
 }
-
 void Parameter::setPhi(real inPhi)
 {
-	Phi = inPhi;
+    Phi = inPhi;
 }
 void Parameter::setAngularVelocity(real inAngVel)
 {
-	angularVelocity = inAngVel;
+    angularVelocity = inAngVel;
 }
 void Parameter::setStepEnsight(unsigned int step)
 {
-	this->stepEnsight = step;
+    this->stepEnsight = step;
 }
 void Parameter::setOutputCount(unsigned int outputCount)
 {
-	this->outputCount = outputCount;
+    this->outputCount = outputCount;
 }
 void Parameter::setlimitOfNodesForVTK(unsigned int limitOfNodesForVTK)
 {
-	this->limitOfNodesForVTK = limitOfNodesForVTK;
+    this->limitOfNodesForVTK = limitOfNodesForVTK;
 }
 void Parameter::setStartTurn(unsigned int inStartTurn)
 {
-	startTurn = inStartTurn;
+    startTurn = inStartTurn;
 }
 void Parameter::setDiffOn(bool isDiff)
 {
-	diffOn = isDiff;
+    diffOn = isDiff;
 }
 void Parameter::setCompOn(bool isComp)
 {
-	compOn = isComp;
+    compOn = isComp;
 }
 void Parameter::setDiffMod(int DiffMod)
 {
-	diffMod = DiffMod;
+    diffMod = DiffMod;
 }
 void Parameter::setD3Qxx(int d3qxx)
 {
-	this->D3Qxx = d3qxx;
+    this->D3Qxx = d3qxx;
 }
 void Parameter::setMaxLevel(int maxlevel)
 {
-    this->maxlevel = maxlevel-1;
-    this->fine     = this->maxlevel;
+    this->maxlevel = maxlevel - 1;
+    this->fine     = this->maxlevel;
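+    // the argument counts the levels; maxlevel/fine store the zero-based
+    // index of the finest level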
     parH.resize(this->maxlevel + 1);
     parD.resize(this->maxlevel + 1);
 }
 void Parameter::setParticleBasicLevel(int pbl)
 {
-	this->particleBasicLevel = pbl;
+    this->particleBasicLevel = pbl;
 }
 void Parameter::setParticleInitLevel(int pil)
 {
-	this->particleInitLevel = pil;
+    this->particleInitLevel = pil;
 }
 void Parameter::setNumberOfParticles(int nop)
 {
-	this->numberOfParticles = nop;
+    this->numberOfParticles = nop;
 }
 void Parameter::setCalcParticles(bool calcParticles)
 {
-	this->calcParticles = calcParticles;
+    this->calcParticles = calcParticles;
 }
 void Parameter::setStartXHotWall(real startXHotWall)
 {
-	this->startXHotWall = startXHotWall;
+    this->startXHotWall = startXHotWall;
 }
 void Parameter::setEndXHotWall(real endXHotWall)
 {
-	this->endXHotWall = endXHotWall;
+    this->endXHotWall = endXHotWall;
 }
 void Parameter::setTEnd(unsigned int tend)
 {
-	ic.tend = tend;
+    ic.tend = tend;
 }
 void Parameter::setTOut(unsigned int tout)
 {
-	ic.tout = tout;
+    ic.tout = tout;
 }
 void Parameter::setTStartOut(unsigned int tStartOut)
 {
-	ic.tStartOut = tStartOut;
+    ic.tStartOut = tStartOut;
 }
 void Parameter::setTimestepOfCoarseLevel(unsigned int timestep)
 {
-	this->timestep = timestep;
+    this->timestep = timestep;
+}
+void Parameter::setCalcTurbulenceIntensity(bool calcVelocityAndFluctuations)
+{
+    this->calcVelocityAndFluctuations = calcVelocityAndFluctuations;
 }
 void Parameter::setCalcMedian(bool calcMedian)
 {
-	ic.calcMedian = calcMedian;
+    ic.calcMedian = calcMedian;
 }
 void Parameter::setCalcDragLift(bool calcDragLift)
 {
-	this->calcDragLift = calcDragLift;
+    this->calcDragLift = calcDragLift;
 }
 void Parameter::setCalcCp(bool calcCp)
 {
-	this->calcCp = calcCp;
+    this->calcCp = calcCp;
 }
 void Parameter::setWriteVeloASCIIfiles(bool writeVeloASCII)
 {
-	this->writeVeloASCII = writeVeloASCII;
+    this->writeVeloASCII = writeVeloASCII;
 }
 void Parameter::setCalcPlaneConc(bool calcPlaneConc)
 {
-	this->calcPlaneConc = calcPlaneConc;
+    this->calcPlaneConc = calcPlaneConc;
 }
 void Parameter::setTimeCalcMedStart(int CalcMedStart)
-{		
-	ic.tCalcMedStart = CalcMedStart;
+{
+    ic.tCalcMedStart = CalcMedStart;
 }
 void Parameter::setTimeCalcMedEnd(int CalcMedEnd)
 {
-	ic.tCalcMedEnd = CalcMedEnd;
+    ic.tCalcMedEnd = CalcMedEnd;
 }
 void Parameter::setOutputPath(std::string oPath)
 {
-	ic.oPath = oPath;
+    ic.oPath = oPath;
 }
 void Parameter::setOutputPrefix(std::string oPrefix)
 {
-	//std::string test = fname;
-	ic.oPrefix = oPrefix;
+    // std::string test = fname;
+    ic.oPrefix = oPrefix;
 }
 void Parameter::setFName(std::string fname)
 {
-	//std::string test = fname;
-	ic.fname = fname;
+    // std::string test = fname;
+    ic.fname = fname;
 }
 void Parameter::setPrintFiles(bool printfiles)
 {
-	ic.printFiles = printfiles;
+    ic.printFiles = printfiles;
 }
 void Parameter::setReadGeo(bool readGeo)
 {
-	ic.readGeo = readGeo;
+    ic.readGeo = readGeo;
 }
 void Parameter::setDiffusivity(real Diffusivity)
 {
-	ic.Diffusivity = Diffusivity;
+    ic.Diffusivity = Diffusivity;
 }
 void Parameter::setTemperatureInit(real Temp)
 {
-	ic.Temp = Temp;
+    ic.Temp = Temp;
 }
 void Parameter::setTemperatureBC(real TempBC)
 {
-	ic.TempBC = TempBC;
+    ic.TempBC = TempBC;
 }
 void Parameter::setViscosity(real Viscosity)
 {
-	ic.vis = Viscosity;
+    ic.vis = Viscosity;
 }
 void Parameter::setVelocity(real Velocity)
 {
-	ic.u0 = Velocity;
+    ic.u0 = Velocity;
 }
 void Parameter::setViscosityRatio(real ViscosityRatio)
 {
-	ic.vis_ratio = ViscosityRatio;
+    ic.vis_ratio = ViscosityRatio;
 }
 void Parameter::setVelocityRatio(real VelocityRatio)
 {
-	ic.u0_ratio = VelocityRatio;
+    ic.u0_ratio = VelocityRatio;
 }
 void Parameter::setDensityRatio(real DensityRatio)
 {
-	ic.delta_rho = DensityRatio;
+    ic.delta_rho = DensityRatio;
 }
 void Parameter::setPressRatio(real PressRatio)
 {
-	ic.delta_press = PressRatio;
+    ic.delta_press = PressRatio;
+}
+real Parameter::getTimeRatio()
+{
+    return this->getViscosityRatio() * pow(this->getVelocityRatio(), -2);
+}
+real Parameter::getForceRatio()
+{
+    return this->getDensityRatio() * pow(this->getViscosityRatio(), 2);
+}
+real Parameter::getLengthRatio()
+{
+    return this->getViscosityRatio() / this->getVelocityRatio();
 }
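+// Unit conversion between LB and SI quantities: with nu = getViscosityRatio()
+// and u = getVelocityRatio(), dimensional analysis gives the ratios
+//   length: dx = nu / u
+//   time:   dt = dx / u = nu / u^2
+//   force:  F  = rho * dx^4 / dt^2 = rho * nu^2   (rho = getDensityRatio())
+// which is what the three getters above return.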
 void Parameter::setRealX(real RealX)
 {
-	ic.RealX = RealX;
+    ic.RealX = RealX;
 }
 void Parameter::setRealY(real RealY)
 {
-	ic.RealY = RealY;
+    ic.RealY = RealY;
 }
 void Parameter::setPressInID(unsigned int PressInID)
 {
-	ic.PressInID = PressInID;
+    ic.PressInID = PressInID;
 }
 void Parameter::setPressOutID(unsigned int PressOutID)
 {
-	ic.PressOutID = PressOutID;
+    ic.PressOutID = PressOutID;
 }
 void Parameter::setPressInZ(unsigned int PressInZ)
 {
-	ic.PressInZ = PressInZ;
+    ic.PressInZ = PressInZ;
 }
 void Parameter::setPressOutZ(unsigned int PressOutZ)
 {
-	ic.PressOutZ = PressOutZ;
+    ic.PressOutZ = PressOutZ;
 }
 void Parameter::setMaxDev(int maxdev)
 {
-	ic.maxdev = maxdev;
+    ic.maxdev = maxdev;
 }
 void Parameter::setMyID(int myid)
 {
-	ic.myid = myid;
+    ic.myid = myid;
 }
 void Parameter::setNumprocs(int numprocs)
 {
-	ic.numprocs = numprocs;
+    ic.numprocs = numprocs;
 }
 void Parameter::setDevices(std::vector<uint> devices)
 {
-	ic.devices = devices;
+    ic.devices = devices;
 }
 void Parameter::setGeometryFileC(std::string GeometryFileC)
 {
-	ic.geometryFileC = GeometryFileC;
+    ic.geometryFileC = GeometryFileC;
 }
 void Parameter::setGeometryFileM(std::string GeometryFileM)
 {
-	ic.geometryFileM = GeometryFileM;
+    ic.geometryFileM = GeometryFileM;
 }
 void Parameter::setGeometryFileF(std::string GeometryFileF)
 {
-	ic.geometryFileF = GeometryFileF;
+    ic.geometryFileF = GeometryFileF;
 }
 void Parameter::setRe(real Re)
 {
-	ic.Re = Re;
+    ic.Re = Re;
 }
 void Parameter::setFactorPressBC(real factorPressBC)
 {
-	ic.factorPressBC = factorPressBC;
+    ic.factorPressBC = factorPressBC;
 }
 void Parameter::setIsGeo(bool isGeo)
 {
-	ic.isGeo = isGeo;
+    ic.isGeo = isGeo;
 }
 void Parameter::setIsGeoNormal(bool isGeoNormal)
 {
-	ic.isGeoNormal = isGeoNormal;
+    ic.isGeoNormal = isGeoNormal;
 }
 void Parameter::setIsInflowNormal(bool isInflowNormal)
 {
-	ic.isInflowNormal = isInflowNormal;
+    ic.isInflowNormal = isInflowNormal;
 }
 void Parameter::setIsOutflowNormal(bool isOutflowNormal)
 {
-	ic.isOutflowNormal = isOutflowNormal;
+    ic.isOutflowNormal = isOutflowNormal;
 }
 void Parameter::setIsProp(bool isProp)
 {
-	ic.isProp = isProp;
+    ic.isProp = isProp;
 }
 void Parameter::setIsCp(bool isCp)
 {
-	ic.isCp = isCp;
+    ic.isCp = isCp;
 }
 void Parameter::setConcFile(bool concFile)
 {
-	ic.isConc = concFile;
+    ic.isConc = concFile;
 }
 void Parameter::setStreetVelocityFile(bool streetVelocityFile)
 {
-	ic.streetVelocityFile = streetVelocityFile;
+    ic.streetVelocityFile = streetVelocityFile;
 }
 void Parameter::setUseMeasurePoints(bool useMeasurePoints)
 {
-	ic.isMeasurePoints = useMeasurePoints;
+    ic.isMeasurePoints = useMeasurePoints;
+}
+void Parameter::setUseInitNeq(bool useInitNeq)
+{
+    ic.isInitNeq = useInitNeq;
+}
+void Parameter::setSimulatePorousMedia(bool simulatePorousMedia)
+{
+    ic.simulatePorousMedia = simulatePorousMedia;
 }
 void Parameter::setUseTurbulentViscosity(bool useTurbulentViscosity)
 {
-	ic.isTurbulentViscosity = useTurbulentViscosity;
+    ic.isTurbulentViscosity = useTurbulentViscosity;
 }
 void Parameter::setUseWale(bool useWale)
 {
-	ic.isWale = useWale;
-	if (useWale) setUseTurbulentViscosity(true);
+    ic.isWale = useWale;
+    if (useWale)
+        setUseTurbulentViscosity(true);
 }
-
 void Parameter::setUseAMD(bool useAMD)
 {
-	ic.isAMD = useAMD;
-	if (useAMD) setUseTurbulentViscosity(true);
-
+    ic.isAMD = useAMD;
+    if (useAMD)
+        setUseTurbulentViscosity(true);
 }
 void Parameter::setSGSConstant(real SGSConstant)
 {
-	ic.SGSConstant = SGSConstant;
+    ic.SGSConstant = SGSConstant;
 }
 void Parameter::setHasWallModelMonitor(bool hasWallModelMonitor)
 {
-	ic.hasWallModelMonitor = hasWallModelMonitor;
-}
-void Parameter::setUseInitNeq(bool useInitNeq)
-{
-	ic.isInitNeq = useInitNeq;
-}
-void Parameter::setSimulatePorousMedia(bool simulatePorousMedia)
-{
-	ic.simulatePorousMedia = simulatePorousMedia;
+    ic.hasWallModelMonitor = hasWallModelMonitor;
 }
 
 void Parameter::setIsF3(bool isF3)
 {
-	this->isF3 = isF3; 
+    this->isF3 = isF3;
 }
 
-void Parameter::setIsBodyForce(bool isBodyForce) 
+void Parameter::setIsBodyForce(bool isBodyForce)
 {
-	this->isBodyForce = isBodyForce;
+    this->isBodyForce = isBodyForce;
 }
 
 void Parameter::setGridX(std::vector<int> GridX)
 {
-	ic.GridX = GridX;
+    ic.GridX = GridX;
 }
 void Parameter::setGridY(std::vector<int> GridY)
 {
-	ic.GridY = GridY;
+    ic.GridY = GridY;
 }
 void Parameter::setGridZ(std::vector<int> GridZ)
 {
-	ic.GridZ = GridZ;
+    ic.GridZ = GridZ;
 }
 void Parameter::setDistX(std::vector<int> DistX)
 {
-	ic.DistX = DistX;
+    ic.DistX = DistX;
 }
 void Parameter::setDistY(std::vector<int> DistY)
 {
-	ic.DistY = DistY;
+    ic.DistY = DistY;
 }
 void Parameter::setDistZ(std::vector<int> DistZ)
 {
-	ic.DistZ = DistZ;
+    ic.DistZ = DistZ;
 }
 void Parameter::setScaleLBMtoSI(std::vector<real> scaleLBMtoSI)
 {
-	ic.scaleLBMtoSI = scaleLBMtoSI;
+    ic.scaleLBMtoSI = scaleLBMtoSI;
 }
 void Parameter::setTranslateLBMtoSI(std::vector<real> translateLBMtoSI)
 {
-	ic.translateLBMtoSI = translateLBMtoSI;
+    ic.translateLBMtoSI = translateLBMtoSI;
 }
 void Parameter::setMinCoordX(std::vector<real> MinCoordX)
 {
-	ic.minCoordX = MinCoordX;
+    ic.minCoordX = MinCoordX;
 }
 void Parameter::setMinCoordY(std::vector<real> MinCoordY)
 {
-	ic.minCoordY = MinCoordY;
+    ic.minCoordY = MinCoordY;
 }
 void Parameter::setMinCoordZ(std::vector<real> MinCoordZ)
 {
-	ic.minCoordZ = MinCoordZ;
+    ic.minCoordZ = MinCoordZ;
 }
 void Parameter::setMaxCoordX(std::vector<real> MaxCoordX)
 {
-	ic.maxCoordX = MaxCoordX;
+    ic.maxCoordX = MaxCoordX;
 }
 void Parameter::setMaxCoordY(std::vector<real> MaxCoordY)
 {
-	ic.maxCoordY = MaxCoordY;
+    ic.maxCoordY = MaxCoordY;
 }
 void Parameter::setMaxCoordZ(std::vector<real> MaxCoordZ)
 {
-	ic.maxCoordZ = MaxCoordZ;
+    ic.maxCoordZ = MaxCoordZ;
 }
-void Parameter::setTempH(TempforBoundaryConditions* TempH)
+void Parameter::setTempH(TempforBoundaryConditions *TempH)
 {
-	this->TempH = TempH;
+    this->TempH = TempH;
 }
-void Parameter::setTempD(TempforBoundaryConditions* TempD)
+void Parameter::setTempD(TempforBoundaryConditions *TempD)
 {
-	this->TempD = TempD;
+    this->TempD = TempD;
 }
-void Parameter::setTempVelH(TempVelforBoundaryConditions* TempVelH)
+void Parameter::setTempVelH(TempVelforBoundaryConditions *TempVelH)
 {
-	this->TempVelH = TempVelH;
+    this->TempVelH = TempVelH;
 }
-void Parameter::setTempVelD(TempVelforBoundaryConditions* TempVelD)
+void Parameter::setTempVelD(TempVelforBoundaryConditions *TempVelD)
 {
-	this->TempVelD = TempVelD;
+    this->TempVelD = TempVelD;
 }
-void Parameter::setTempPressH(TempPressforBoundaryConditions* TempPressH)
+void Parameter::setTempPressH(TempPressforBoundaryConditions *TempPressH)
 {
-	this->TempPressH = TempPressH;
+    this->TempPressH = TempPressH;
 }
-void Parameter::setTempPressD(TempPressforBoundaryConditions* TempPressD)
+void Parameter::setTempPressD(TempPressforBoundaryConditions *TempPressD)
 {
-	this->TempPressD = TempPressD;
+    this->TempPressD = TempPressD;
 }
-//void Parameter::setkInflowQ(unsigned int kInflowQ)
+// void Parameter::setkInflowQ(unsigned int kInflowQ)
 //{
 //   this->kInflowQ = kInflowQ;
 //}
-//void Parameter::setkOutflowQ(unsigned int kOutflowQ)
+// void Parameter::setkOutflowQ(unsigned int kOutflowQ)
 //{
 //   this->kOutflowQ = kOutflowQ;
 //}
-//void Parameter::setQinflowH(QforBoundaryConditions* QinflowH)
+// void Parameter::setQinflowH(QforBoundaryConditions* QinflowH)
 //{
 //   this->QinflowH = QinflowH;
 //}
-//void Parameter::setQinflowD(QforBoundaryConditions* QinflowD)
+// void Parameter::setQinflowD(QforBoundaryConditions* QinflowD)
 //{
 //   this->QinflowD = QinflowD;
 //}
-//void Parameter::setQoutflowH(QforBoundaryConditions* QoutflowH)
+// void Parameter::setQoutflowH(QforBoundaryConditions* QoutflowH)
 //{
 //   this->QoutflowH = QoutflowH;
 //}
-//void Parameter::setQoutflowD(QforBoundaryConditions* QoutflowD)
+// void Parameter::setQoutflowD(QforBoundaryConditions* QoutflowD)
 //{
 //   this->QoutflowD = QoutflowD;
 //}
 void Parameter::setkFull(std::string kFull)
 {
-	ic.kFull = kFull;
+    ic.kFull = kFull;
 }
 void Parameter::setgeoFull(std::string geoFull)
 {
-	ic.geoFull = geoFull;
+    ic.geoFull = geoFull;
 }
 void Parameter::setgeoVec(std::string geoVec)
 {
-	ic.geoVec = geoVec;
+    ic.geoVec = geoVec;
 }
 void Parameter::setcoordX(std::string coordX)
 {
-	ic.coordX = coordX;
+    ic.coordX = coordX;
 }
 void Parameter::setcoordY(std::string coordY)
 {
-	ic.coordY = coordY;
+    ic.coordY = coordY;
 }
 void Parameter::setcoordZ(std::string coordZ)
 {
-	ic.coordZ = coordZ;
+    ic.coordZ = coordZ;
 }
 void Parameter::setneighborX(std::string neighborX)
 {
-	ic.neighborX = neighborX;
+    ic.neighborX = neighborX;
 }
 void Parameter::setneighborY(std::string neighborY)
 {
-	ic.neighborY = neighborY;
+    ic.neighborY = neighborY;
 }
 void Parameter::setneighborZ(std::string neighborZ)
 {
-	ic.neighborZ = neighborZ;
+    ic.neighborZ = neighborZ;
 }
 void Parameter::setneighborWSB(std::string neighborWSB)
 {
-	ic.neighborWSB = neighborWSB;
+    ic.neighborWSB = neighborWSB;
 }
 void Parameter::setscaleCFC(std::string scaleCFC)
 {
-	ic.scaleCFC = scaleCFC;
+    ic.scaleCFC = scaleCFC;
 }
 void Parameter::setscaleCFF(std::string scaleCFF)
 {
-	ic.scaleCFF = scaleCFF;
+    ic.scaleCFF = scaleCFF;
 }
 void Parameter::setscaleFCC(std::string scaleFCC)
 {
-	ic.scaleFCC = scaleFCC;
+    ic.scaleFCC = scaleFCC;
 }
 void Parameter::setscaleFCF(std::string scaleFCF)
 {
-	ic.scaleFCF = scaleFCF;
+    ic.scaleFCF = scaleFCF;
 }
 void Parameter::setscaleOffsetCF(std::string scaleOffsetCF)
 {
-	ic.scaleOffsetCF = scaleOffsetCF;
+    ic.scaleOffsetCF = scaleOffsetCF;
 }
 void Parameter::setscaleOffsetFC(std::string scaleOffsetFC)
 {
-	ic.scaleOffsetFC = scaleOffsetFC;
+    ic.scaleOffsetFC = scaleOffsetFC;
 }
 void Parameter::setgeomBoundaryBcQs(std::string geomBoundaryBcQs)
 {
-	ic.geomBoundaryBcQs = geomBoundaryBcQs;
+    ic.geomBoundaryBcQs = geomBoundaryBcQs;
 }
 void Parameter::setgeomBoundaryBcValues(std::string geomBoundaryBcValues)
 {
-	ic.geomBoundaryBcValues = geomBoundaryBcValues;
+    ic.geomBoundaryBcValues = geomBoundaryBcValues;
 }
 void Parameter::setnoSlipBcPos(std::string noSlipBcPos)
 {
-	ic.noSlipBcPos = noSlipBcPos;
+    ic.noSlipBcPos = noSlipBcPos;
 }
 void Parameter::setnoSlipBcQs(std::string noSlipBcQs)
 {
-	ic.noSlipBcQs = noSlipBcQs;
+    ic.noSlipBcQs = noSlipBcQs;
 }
 void Parameter::setnoSlipBcValue(std::string noSlipBcValue)
 {
-	ic.noSlipBcValue = noSlipBcValue;
+    ic.noSlipBcValue = noSlipBcValue;
 }
 void Parameter::setnoSlipBcValues(std::string noSlipBcValues)
 {
-	ic.noSlipBcValues = noSlipBcValues;
+    ic.noSlipBcValues = noSlipBcValues;
 }
 void Parameter::setslipBcPos(std::string slipBcPos)
 {
-	ic.slipBcPos = slipBcPos;
+    ic.slipBcPos = slipBcPos;
 }
 void Parameter::setslipBcQs(std::string slipBcQs)
 {
-	ic.slipBcQs = slipBcQs;
+    ic.slipBcQs = slipBcQs;
 }
 void Parameter::setslipBcValue(std::string slipBcValue)
 {
-	ic.slipBcValue = slipBcValue;
+    ic.slipBcValue = slipBcValue;
 }
 void Parameter::setpressBcPos(std::string pressBcPos)
 {
-	ic.pressBcPos = pressBcPos;
+    ic.pressBcPos = pressBcPos;
 }
 void Parameter::setpressBcQs(std::string pressBcQs)
 {
-	ic.pressBcQs = pressBcQs;
+    ic.pressBcQs = pressBcQs;
 }
 void Parameter::setpressBcValue(std::string pressBcValue)
 {
-	ic.pressBcValue = pressBcValue;
+    ic.pressBcValue = pressBcValue;
 }
 void Parameter::setpressBcValues(std::string pressBcValues)
 {
-	ic.pressBcValues = pressBcValues;
+    ic.pressBcValues = pressBcValues;
 }
 void Parameter::setvelBcQs(std::string velBcQs)
 {
-	ic.velBcQs = velBcQs;
+    ic.velBcQs = velBcQs;
 }
 void Parameter::setvelBcValues(std::string velBcValues)
 {
-	ic.velBcValues = velBcValues;
+    ic.velBcValues = velBcValues;
 }
 void Parameter::setinletBcQs(std::string inletBcQs)
 {
-	ic.inletBcQs = inletBcQs;
+    ic.inletBcQs = inletBcQs;
 }
 void Parameter::setinletBcValues(std::string inletBcValues)
 {
-	ic.inletBcValues = inletBcValues;
+    ic.inletBcValues = inletBcValues;
 }
 void Parameter::setoutletBcQs(std::string outletBcQs)
 {
-	ic.outletBcQs = outletBcQs;
+    ic.outletBcQs = outletBcQs;
 }
 void Parameter::setoutletBcValues(std::string outletBcValues)
 {
-	ic.outletBcValues = outletBcValues;
+    ic.outletBcValues = outletBcValues;
 }
 void Parameter::settopBcQs(std::string topBcQs)
 {
-	ic.topBcQs = topBcQs;
+    ic.topBcQs = topBcQs;
 }
 void Parameter::settopBcValues(std::string topBcValues)
 {
-	ic.topBcValues = topBcValues;
+    ic.topBcValues = topBcValues;
 }
 void Parameter::setbottomBcQs(std::string bottomBcQs)
 {
-	ic.bottomBcQs = bottomBcQs;
+    ic.bottomBcQs = bottomBcQs;
 }
 void Parameter::setbottomBcValues(std::string bottomBcValues)
 {
-	ic.bottomBcValues = bottomBcValues;
+    ic.bottomBcValues = bottomBcValues;
 }
 void Parameter::setfrontBcQs(std::string frontBcQs)
 {
-	ic.frontBcQs = frontBcQs;
+    ic.frontBcQs = frontBcQs;
 }
 void Parameter::setfrontBcValues(std::string frontBcValues)
 {
-	ic.frontBcValues = frontBcValues;
+    ic.frontBcValues = frontBcValues;
 }
 void Parameter::setbackBcQs(std::string backBcQs)
 {
-	ic.backBcQs = backBcQs;
+    ic.backBcQs = backBcQs;
 }
 void Parameter::setbackBcValues(std::string backBcValues)
 {
-	ic.backBcValues = backBcValues;
+    ic.backBcValues = backBcValues;
 }
 void Parameter::setwallBcQs(std::string wallBcQs)
 {
-	ic.wallBcQs = wallBcQs;
+    ic.wallBcQs = wallBcQs;
 }
 void Parameter::setwallBcValues(std::string wallBcValues)
 {
-	ic.wallBcValues = wallBcValues;
+    ic.wallBcValues = wallBcValues;
 }
 void Parameter::setperiodicBcQs(std::string periodicBcQs)
 {
-	ic.periodicBcQs = periodicBcQs;
+    ic.periodicBcQs = periodicBcQs;
 }
 void Parameter::setperiodicBcValues(std::string periodicBcValues)
 {
-	ic.periodicBcValues = periodicBcValues;
+    ic.periodicBcValues = periodicBcValues;
 }
 void Parameter::setpropellerQs(std::string propellerQs)
 {
-	ic.propellerQs = propellerQs;
+    ic.propellerQs = propellerQs;
 }
 void Parameter::setpropellerValues(std::string propellerValues)
 {
-	ic.propellerValues = propellerValues;
+    ic.propellerValues = propellerValues;
 }
 void Parameter::setpropellerCylinder(std::string propellerCylinder)
 {
-	ic.propellerCylinder = propellerCylinder;
+    ic.propellerCylinder = propellerCylinder;
 }
 void Parameter::setmeasurePoints(std::string measurePoints)
 {
-	ic.measurePoints = measurePoints;
+    ic.measurePoints = measurePoints;
 }
 void Parameter::setnumberNodes(std::string numberNodes)
 {
-	ic.numberNodes = numberNodes;
+    ic.numberNodes = numberNodes;
 }
 void Parameter::setLBMvsSI(std::string LBMvsSI)
 {
-	ic.LBMvsSI = LBMvsSI;
+    ic.LBMvsSI = LBMvsSI;
 }
 void Parameter::setcpTop(std::string cpTop)
 {
-	ic.cpTop = cpTop;
+    ic.cpTop = cpTop;
 }
 void Parameter::setcpBottom(std::string cpBottom)
 {
-	ic.cpBottom = cpBottom;
+    ic.cpBottom = cpBottom;
 }
 void Parameter::setcpBottom2(std::string cpBottom2)
 {
-	ic.cpBottom2 = cpBottom2;
+    ic.cpBottom2 = cpBottom2;
 }
 void Parameter::setConcentration(std::string concFile)
 {
-	ic.concentration = concFile;
+    ic.concentration = concFile;
 }
 void Parameter::setStreetVelocity(std::string streetVelocity)
 {
-	ic.streetVelocity = streetVelocity;
+    ic.streetVelocity = streetVelocity;
 }
 void Parameter::setclockCycleForMP(real clockCycleForMP)
 {
-	ic.clockCycleForMP = clockCycleForMP;
+    ic.clockCycleForMP = clockCycleForMP;
 }
 void Parameter::setTimeDoCheckPoint(unsigned int tDoCheckPoint)
 {
-	ic.tDoCheckPoint = tDoCheckPoint;
+    ic.tDoCheckPoint = tDoCheckPoint;
 }
 void Parameter::setTimeDoRestart(unsigned int tDoRestart)
 {
-	ic.tDoRestart = tDoRestart;
+    ic.tDoRestart = tDoRestart;
 }
 void Parameter::setDoCheckPoint(bool doCheckPoint)
 {
-	ic.doCheckPoint = doCheckPoint;
+    ic.doCheckPoint = doCheckPoint;
 }
 void Parameter::setDoRestart(bool doRestart)
 {
-	ic.doRestart = doRestart;
+    ic.doRestart = doRestart;
 }
 void Parameter::settimestepForMP(unsigned int timestepForMP)
 {
-	ic.timeStepForMP = timestepForMP;
+    ic.timeStepForMP = timestepForMP;
 }
 void Parameter::setObj(std::string str, bool isObj)
 {
-	if (str == "geo")
-	{
-		this->setIsGeo(isObj);
-	}
-	else if (str == "prop")
-	{
-		this->setIsProp(isObj);
-	}
-	else if (str == "cp")
-	{
-		this->setIsCp(isObj);
-	}
-	else if (str == "geoNormal")
-	{
-		this->setIsGeoNormal(isObj);
-	}
-	else if (str == "inflowNormal")
-	{
-		this->setIsInflowNormal(isObj);
-	}
-	else if (str == "outflowNormal")
-	{
-		this->setIsOutflowNormal(isObj);
-	}
+    if (str == "geo") {
+        this->setIsGeo(isObj);
+    } else if (str == "prop") {
+        this->setIsProp(isObj);
+    } else if (str == "cp") {
+        this->setIsCp(isObj);
+    } else if (str == "geoNormal") {
+        this->setIsGeoNormal(isObj);
+    } else if (str == "inflowNormal") {
+        this->setIsInflowNormal(isObj);
+    } else if (str == "outflowNormal") {
+        this->setIsOutflowNormal(isObj);
+    }
 }
 void Parameter::setGeometryValues(bool GeometryValues)
 {
-	ic.GeometryValues = GeometryValues;
+    ic.GeometryValues = GeometryValues;
 }
 void Parameter::setCalc2ndOrderMoments(bool is2ndOrderMoments)
 {
-	ic.is2ndOrderMoments = is2ndOrderMoments;
+    ic.is2ndOrderMoments = is2ndOrderMoments;
 }
 void Parameter::setCalc3rdOrderMoments(bool is3rdOrderMoments)
 {
-	ic.is3rdOrderMoments = is3rdOrderMoments;
+    ic.is3rdOrderMoments = is3rdOrderMoments;
 }
 void Parameter::setCalcHighOrderMoments(bool isHighOrderMoments)
 {
-	ic.isHighOrderMoments = isHighOrderMoments;
+    ic.isHighOrderMoments = isHighOrderMoments;
 }
 void Parameter::setMemsizeGPU(double admem, bool reset)
 {
-	if (reset == true)
-	{
-		this->memsizeGPU = 0.;
-	} 
-	else
-	{
-		this->memsizeGPU += admem;
-	}
+    if (reset == true) {
+        this->memsizeGPU = 0.;
+    } else {
+        this->memsizeGPU += admem;
+    }
 }
-//1D domain decomposition
+// 1D domain decomposition
 void Parameter::setPossNeighborFiles(std::vector<std::string> possNeighborFiles, std::string sor)
 {
-	if (sor=="send")
-	{
-		this->possNeighborFilesSend = possNeighborFiles;
-	} 
-	else if (sor == "recv")
-	{
-		this->possNeighborFilesRecv = possNeighborFiles;
-	}
+    if (sor == "send") {
+        this->possNeighborFilesSend = possNeighborFiles;
+    } else if (sor == "recv") {
+        this->possNeighborFilesRecv = possNeighborFiles;
+    }
 }
 void Parameter::setNumberOfProcessNeighbors(unsigned int numberOfProcessNeighbors, int level, std::string sor)
 {
-	if (sor=="send")
-	{
-		parH[level]->sendProcessNeighbor.resize(numberOfProcessNeighbors);
-		parD[level]->sendProcessNeighbor.resize(numberOfProcessNeighbors);
-	} 
-	else if (sor == "recv")
-	{
-		parH[level]->recvProcessNeighbor.resize(numberOfProcessNeighbors);
-		parD[level]->recvProcessNeighbor.resize(numberOfProcessNeighbors);
-	}
+    if (sor == "send") {
+        parH[level]->sendProcessNeighbor.resize(numberOfProcessNeighbors);
+        parD[level]->sendProcessNeighbor.resize(numberOfProcessNeighbors);
+    } else if (sor == "recv") {
+        parH[level]->recvProcessNeighbor.resize(numberOfProcessNeighbors);
+        parD[level]->recvProcessNeighbor.resize(numberOfProcessNeighbors);
+    }
 }
 void Parameter::setIsNeighbor(bool isNeigbor)
 {
-	this->isNeigbor = isNeigbor;
+    this->isNeigbor = isNeigbor;
 }
-//3D domain decomposition
+// 3D domain decomposition
 void Parameter::setPossNeighborFilesX(std::vector<std::string> possNeighborFiles, std::string sor)
 {
-	if (sor=="send")
-	{
-		this->possNeighborFilesSendX = possNeighborFiles;
-	} 
-	else if (sor == "recv")
-	{
-		this->possNeighborFilesRecvX = possNeighborFiles;
-	}
+    if (sor == "send") {
+        this->possNeighborFilesSendX = possNeighborFiles;
+    } else if (sor == "recv") {
+        this->possNeighborFilesRecvX = possNeighborFiles;
+    }
 }
 void Parameter::setPossNeighborFilesY(std::vector<std::string> possNeighborFiles, std::string sor)
 {
-	if (sor=="send")
-	{
-		this->possNeighborFilesSendY = possNeighborFiles;
-	} 
-	else if (sor == "recv")
-	{
-		this->possNeighborFilesRecvY = possNeighborFiles;
-	}
+    if (sor == "send") {
+        this->possNeighborFilesSendY = possNeighborFiles;
+    } else if (sor == "recv") {
+        this->possNeighborFilesRecvY = possNeighborFiles;
+    }
 }
 void Parameter::setPossNeighborFilesZ(std::vector<std::string> possNeighborFiles, std::string sor)
 {
-	if (sor=="send")
-	{
-		this->possNeighborFilesSendZ = possNeighborFiles;
-	} 
-	else if (sor == "recv")
-	{
-		this->possNeighborFilesRecvZ = possNeighborFiles;
-	}
+    if (sor == "send") {
+        this->possNeighborFilesSendZ = possNeighborFiles;
+    } else if (sor == "recv") {
+        this->possNeighborFilesRecvZ = possNeighborFiles;
+    }
 }
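+// In the resize helpers below, the *AD* arrays hold the process-neighbor data
+// of the second (advection-diffusion) component and are therefore only
+// resized when DiffOn is set.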
 void Parameter::setNumberOfProcessNeighborsX(unsigned int numberOfProcessNeighbors, int level, std::string sor)
 {
-	if (sor=="send")
-	{
-		parH[level]->sendProcessNeighborX.resize(numberOfProcessNeighbors);
-		parD[level]->sendProcessNeighborX.resize(numberOfProcessNeighbors);
-		//////////////////////////////////////////////////////////////////////////
-		if (getDiffOn()==true){
-			parH[level]->sendProcessNeighborADX.resize(numberOfProcessNeighbors);
-			parD[level]->sendProcessNeighborADX.resize(numberOfProcessNeighbors);
-		}
-		//////////////////////////////////////////////////////////////////////////
-	} 
-	else if (sor == "recv")
-	{
-		parH[level]->recvProcessNeighborX.resize(numberOfProcessNeighbors);
-		parD[level]->recvProcessNeighborX.resize(numberOfProcessNeighbors);
-		//////////////////////////////////////////////////////////////////////////
-		if (getDiffOn()==true){
-			parH[level]->recvProcessNeighborADX.resize(numberOfProcessNeighbors);
-			parD[level]->recvProcessNeighborADX.resize(numberOfProcessNeighbors);
-		}
-		//////////////////////////////////////////////////////////////////////////
-	}
+    if (sor == "send") {
+        parH[level]->sendProcessNeighborX.resize(numberOfProcessNeighbors);
+        parD[level]->sendProcessNeighborX.resize(numberOfProcessNeighbors);
+        //////////////////////////////////////////////////////////////////////////
+        if (getDiffOn() == true) {
+            parH[level]->sendProcessNeighborADX.resize(numberOfProcessNeighbors);
+            parD[level]->sendProcessNeighborADX.resize(numberOfProcessNeighbors);
+        }
+        //////////////////////////////////////////////////////////////////////////
+    } else if (sor == "recv") {
+        parH[level]->recvProcessNeighborX.resize(numberOfProcessNeighbors);
+        parD[level]->recvProcessNeighborX.resize(numberOfProcessNeighbors);
+        //////////////////////////////////////////////////////////////////////////
+        if (getDiffOn() == true) {
+            parH[level]->recvProcessNeighborADX.resize(numberOfProcessNeighbors);
+            parD[level]->recvProcessNeighborADX.resize(numberOfProcessNeighbors);
+        }
+        //////////////////////////////////////////////////////////////////////////
+    }
 }
 void Parameter::setNumberOfProcessNeighborsY(unsigned int numberOfProcessNeighbors, int level, std::string sor)
 {
-	if (sor=="send")
-	{
-		parH[level]->sendProcessNeighborY.resize(numberOfProcessNeighbors);
-		parD[level]->sendProcessNeighborY.resize(numberOfProcessNeighbors);
-		//////////////////////////////////////////////////////////////////////////
-		if (getDiffOn()==true){
-			parH[level]->sendProcessNeighborADY.resize(numberOfProcessNeighbors);
-			parD[level]->sendProcessNeighborADY.resize(numberOfProcessNeighbors);
-		}
-		//////////////////////////////////////////////////////////////////////////
-	} 
-	else if (sor == "recv")
-	{
-		parH[level]->recvProcessNeighborY.resize(numberOfProcessNeighbors);
-		parD[level]->recvProcessNeighborY.resize(numberOfProcessNeighbors);
-		//////////////////////////////////////////////////////////////////////////
-		if (getDiffOn()==true){
-			parH[level]->recvProcessNeighborADY.resize(numberOfProcessNeighbors);
-			parD[level]->recvProcessNeighborADY.resize(numberOfProcessNeighbors);
-		}
-		//////////////////////////////////////////////////////////////////////////
-	}
+    if (sor == "send") {
+        parH[level]->sendProcessNeighborY.resize(numberOfProcessNeighbors);
+        parD[level]->sendProcessNeighborY.resize(numberOfProcessNeighbors);
+        //////////////////////////////////////////////////////////////////////////
+        if (getDiffOn() == true) {
+            parH[level]->sendProcessNeighborADY.resize(numberOfProcessNeighbors);
+            parD[level]->sendProcessNeighborADY.resize(numberOfProcessNeighbors);
+        }
+        //////////////////////////////////////////////////////////////////////////
+    } else if (sor == "recv") {
+        parH[level]->recvProcessNeighborY.resize(numberOfProcessNeighbors);
+        parD[level]->recvProcessNeighborY.resize(numberOfProcessNeighbors);
+        //////////////////////////////////////////////////////////////////////////
+        if (getDiffOn() == true) {
+            parH[level]->recvProcessNeighborADY.resize(numberOfProcessNeighbors);
+            parD[level]->recvProcessNeighborADY.resize(numberOfProcessNeighbors);
+        }
+        //////////////////////////////////////////////////////////////////////////
+    }
 }
 void Parameter::setNumberOfProcessNeighborsZ(unsigned int numberOfProcessNeighbors, int level, std::string sor)
 {
-	if (sor=="send")
-	{
-		parH[level]->sendProcessNeighborZ.resize(numberOfProcessNeighbors);
-		parD[level]->sendProcessNeighborZ.resize(numberOfProcessNeighbors);
-		//////////////////////////////////////////////////////////////////////////
-		if (getDiffOn()==true){
-			parH[level]->sendProcessNeighborADZ.resize(numberOfProcessNeighbors);
-			parD[level]->sendProcessNeighborADZ.resize(numberOfProcessNeighbors);
-		}
-		//////////////////////////////////////////////////////////////////////////
-	} 
-	else if (sor == "recv")
-	{
-		parH[level]->recvProcessNeighborZ.resize(numberOfProcessNeighbors);
-		parD[level]->recvProcessNeighborZ.resize(numberOfProcessNeighbors);
-		//////////////////////////////////////////////////////////////////////////
-		if (getDiffOn()==true){
-			parH[level]->recvProcessNeighborADZ.resize(numberOfProcessNeighbors);
-			parD[level]->recvProcessNeighborADZ.resize(numberOfProcessNeighbors);
-		}
-		//////////////////////////////////////////////////////////////////////////
-	}
+    if (sor == "send") {
+        parH[level]->sendProcessNeighborZ.resize(numberOfProcessNeighbors);
+        parD[level]->sendProcessNeighborZ.resize(numberOfProcessNeighbors);
+        //////////////////////////////////////////////////////////////////////////
+        if (getDiffOn() == true) {
+            parH[level]->sendProcessNeighborADZ.resize(numberOfProcessNeighbors);
+            parD[level]->sendProcessNeighborADZ.resize(numberOfProcessNeighbors);
+        }
+        //////////////////////////////////////////////////////////////////////////
+    } else if (sor == "recv") {
+        parH[level]->recvProcessNeighborZ.resize(numberOfProcessNeighbors);
+        parD[level]->recvProcessNeighborZ.resize(numberOfProcessNeighbors);
+        //////////////////////////////////////////////////////////////////////////
+        if (getDiffOn() == true) {
+            parH[level]->recvProcessNeighborADZ.resize(numberOfProcessNeighbors);
+            parD[level]->recvProcessNeighborADZ.resize(numberOfProcessNeighbors);
+        }
+        //////////////////////////////////////////////////////////////////////////
+    }
 }
 void Parameter::setIsNeighborX(bool isNeigbor)
 {
-	this->isNeigborX = isNeigbor;
+    this->isNeigborX = isNeigbor;
 }
 void Parameter::setIsNeighborY(bool isNeigbor)
 {
-	this->isNeigborY = isNeigbor;
+    this->isNeigborY = isNeigbor;
 }
 void Parameter::setIsNeighborZ(bool isNeigbor)
 {
-	this->isNeigborZ = isNeigbor;
+    this->isNeigborZ = isNeigbor;
+}
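+// The "AfterFtoC" process neighbors describe the reduced node set that is
+// exchanged right after the fine-to-coarse interpolation when
+// useReducedCommunicationAfterFtoC is enabled (see readConfigData).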
+void Parameter::setSendProcessNeighborsAfterFtoCX(int numberOfNodes, int level, int arrayIndex)
+{
+    this->getParH(level)->sendProcessNeighborsAfterFtoCX[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCX[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParH(level)->sendProcessNeighborsAfterFtoCX[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCX[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParH(level)->sendProcessNeighborsAfterFtoCX[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCX[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+}
+void Parameter::setSendProcessNeighborsAfterFtoCY(int numberOfNodes, int level, int arrayIndex)
+{
+    this->getParH(level)->sendProcessNeighborsAfterFtoCY[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCY[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParH(level)->sendProcessNeighborsAfterFtoCY[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCY[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParH(level)->sendProcessNeighborsAfterFtoCY[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCY[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+}
+void Parameter::setSendProcessNeighborsAfterFtoCZ(int numberOfNodes, int level, int arrayIndex)
+{
+    this->getParH(level)->sendProcessNeighborsAfterFtoCZ[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCZ[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParH(level)->sendProcessNeighborsAfterFtoCZ[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCZ[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParH(level)->sendProcessNeighborsAfterFtoCZ[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+    this->getParD(level)->sendProcessNeighborsAfterFtoCZ[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+}
+void Parameter::setRecvProcessNeighborsAfterFtoCX(int numberOfNodes, int level, int arrayIndex)
+{
+    this->getParH(level)->recvProcessNeighborsAfterFtoCX[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCX[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParH(level)->recvProcessNeighborsAfterFtoCX[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCX[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParH(level)->recvProcessNeighborsAfterFtoCX[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCX[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+}
+void Parameter::setRecvProcessNeighborsAfterFtoCY(int numberOfNodes, int level, int arrayIndex)
+{
+    this->getParH(level)->recvProcessNeighborsAfterFtoCY[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCY[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParH(level)->recvProcessNeighborsAfterFtoCY[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCY[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParH(level)->recvProcessNeighborsAfterFtoCY[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCY[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+}
+void Parameter::setRecvProcessNeighborsAfterFtoCZ(int numberOfNodes, int level, int arrayIndex)
+{
+    this->getParH(level)->recvProcessNeighborsAfterFtoCZ[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCZ[arrayIndex].numberOfNodes = numberOfNodes;
+    this->getParH(level)->recvProcessNeighborsAfterFtoCZ[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCZ[arrayIndex].memsizeFs     = sizeof(real) * numberOfNodes;
+    this->getParH(level)->recvProcessNeighborsAfterFtoCZ[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
+    this->getParD(level)->recvProcessNeighborsAfterFtoCZ[arrayIndex].numberOfFs    = this->D3Qxx * numberOfNodes;
 }
 void Parameter::setgeomBoundaryNormalX(std::string geomNormalX)
 {
-	ic.geomNormalX = geomNormalX;
+    ic.geomNormalX = geomNormalX;
 }
 void Parameter::setgeomBoundaryNormalY(std::string geomNormalY)
 {
-	ic.geomNormalY = geomNormalY;
+    ic.geomNormalY = geomNormalY;
 }
 void Parameter::setgeomBoundaryNormalZ(std::string geomNormalZ)
 {
-	ic.geomNormalZ = geomNormalZ;
+    ic.geomNormalZ = geomNormalZ;
 }
 void Parameter::setInflowBoundaryNormalX(std::string inflowNormalX)
 {
-	ic.inflowNormalX = inflowNormalX;
+    ic.inflowNormalX = inflowNormalX;
 }
 void Parameter::setInflowBoundaryNormalY(std::string inflowNormalY)
 {
-	ic.inflowNormalY = inflowNormalY;
+    ic.inflowNormalY = inflowNormalY;
 }
 void Parameter::setInflowBoundaryNormalZ(std::string inflowNormalZ)
 {
-	ic.inflowNormalZ = inflowNormalZ;
+    ic.inflowNormalZ = inflowNormalZ;
 }
 void Parameter::setOutflowBoundaryNormalX(std::string outflowNormalX)
 {
-	ic.outflowNormalX = outflowNormalX;
+    ic.outflowNormalX = outflowNormalX;
 }
 void Parameter::setOutflowBoundaryNormalY(std::string outflowNormalY)
 {
-	ic.outflowNormalY = outflowNormalY;
+    ic.outflowNormalY = outflowNormalY;
 }
 void Parameter::setOutflowBoundaryNormalZ(std::string outflowNormalZ)
 {
-	ic.outflowNormalZ = outflowNormalZ;
+    ic.outflowNormalZ = outflowNormalZ;
 }
 void Parameter::setMainKernel(std::string kernel)
 {
-	this->mainKernel = kernel;
+    this->mainKernel = kernel;
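+    // kernels with "Stream" in their name need the fluid node index lists to
+    // run (cf. the useStreams option in readConfigData)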
+    if (kernel.find("Stream") != std::string::npos)
+        this->kernelNeedsFluidNodeIndicesToRun = true;
 }
 void Parameter::setMultiKernelOn(bool isOn)
 {
-	this->multiKernelOn = isOn;
+    this->multiKernelOn = isOn;
 }
-void Parameter::setMultiKernelLevel(std::vector< int> kernelLevel)
+void Parameter::setMultiKernelLevel(std::vector<int> kernelLevel)
 {
-	this->multiKernelLevel = kernelLevel;
+    this->multiKernelLevel = kernelLevel;
 }
-void Parameter::setMultiKernel(std::vector< std::string> kernel)
+void Parameter::setMultiKernel(std::vector<std::string> kernel)
 {
-	this->multiKernel = kernel;
+    this->multiKernel = kernel;
 }
 void Parameter::setADKernel(std::string adKernel)
 {
-	this->adKernel = adKernel;
+    this->adKernel = adKernel;
 }
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//add-methods
+// add-methods
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void Parameter::addActuator(SPtr<PreCollisionInteractor> actuator)
 {
-	actuators.push_back(actuator);
+    actuators.push_back(actuator);
 }
 void Parameter::addProbe(SPtr<PreCollisionInteractor> probe)
 {
-	probes.push_back(probe);
+    probes.push_back(probe);
 }
 
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//get-methods
+// get-methods
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-double* Parameter::getForcesDouble()
+double *Parameter::getForcesDouble()
 {
-	return this->hostForcing;
+    return this->hostForcing;
 }
-real* Parameter::getForcesHost()
+real *Parameter::getForcesHost()
 {
-	return this->forcingH;
+    return this->forcingH;
 }
-real* Parameter::getForcesDev()
+real *Parameter::getForcesDev()
 {
-	return this->forcingD;
+    return this->forcingD;
 }
-double * Parameter::getQuadricLimitersDouble()
+double *Parameter::getQuadricLimitersDouble()
 {
     return this->hostQuadricLimiters;
 }
-real * Parameter::getQuadricLimitersHost()
+real *Parameter::getQuadricLimitersHost()
 {
     return this->quadricLimitersH;
 }
-real * Parameter::getQuadricLimitersDev()
+real *Parameter::getQuadricLimitersDev()
 {
     return this->quadricLimitersD;
 }
 real Parameter::getPhi()
 {
-	return Phi;
+    return Phi;
 }
 real Parameter::getAngularVelocity()
 {
-	return angularVelocity;
+    return angularVelocity;
 }
 real Parameter::getStartXHotWall()
 {
-	return this->startXHotWall;
+    return this->startXHotWall;
 }
 real Parameter::getEndXHotWall()
 {
-	return this->endXHotWall;
+    return this->endXHotWall;
 }
 unsigned int Parameter::getStepEnsight()
 {
-	return this->stepEnsight;
+    return this->stepEnsight;
 }
 unsigned int Parameter::getOutputCount()
 {
-	return this->outputCount;
+    return this->outputCount;
 }
 unsigned int Parameter::getlimitOfNodesForVTK()
 {
-	return this->limitOfNodesForVTK;
+    return this->limitOfNodesForVTK;
 }
 unsigned int Parameter::getStartTurn()
 {
-	return startTurn;
+    return startTurn;
 }
 std::shared_ptr<LBMSimulationParameter> Parameter::getParD(int level)
 {
-	return parD[level];
+    return parD[level];
 }
 std::shared_ptr<LBMSimulationParameter> Parameter::getParH(int level)
 {
-	return parH[level];
+    return parH[level];
 }
 unsigned int Parameter::getSizeMat(int level)
 {
-	return parH[level]->size_Mat;
+    return parH[level]->size_Mat;
 }
 unsigned int Parameter::getMemSizereal(int level)
 {
-	return parH[level]->mem_size_real;
+    return parH[level]->mem_size_real;
 }
 unsigned int Parameter::getMemSizeInt(int level)
 {
-	return parH[level]->mem_size_int;
+    return parH[level]->mem_size_int;
 }
 unsigned int Parameter::getMemSizeBool(int level)
 {
-	return parH[level]->mem_size_bool;
+    return parH[level]->mem_size_bool;
 }
 unsigned int Parameter::getMemSizerealYZ(int level)
 {
-	return parH[level]->mem_size_real_yz;
+    return parH[level]->mem_size_real_yz;
 }
 int Parameter::getFine()
 {
-	return fine;
+    return fine;
 }
 int Parameter::getCoarse()
 {
-	return coarse;
+    return coarse;
 }
 int Parameter::getParticleBasicLevel()
 {
-	return this->particleBasicLevel;
+    return this->particleBasicLevel;
 }
 int Parameter::getParticleInitLevel()
 {
-	return this->particleInitLevel;
+    return this->particleInitLevel;
 }
 int Parameter::getNumberOfParticles()
 {
-	return this->numberOfParticles;
+    return this->numberOfParticles;
 }
 bool Parameter::getEvenOrOdd(int level)
 {
-	return parH[level]->evenOrOdd;
+    return parH[level]->evenOrOdd;
 }
 bool Parameter::getDiffOn()
 {
-	return diffOn;
+    return diffOn;
 }
 bool Parameter::getCompOn()
 {
-	return compOn;
+    return compOn;
 }
 int Parameter::getDiffMod()
 {
-	return diffMod;
+    return diffMod;
 }
 int Parameter::getFactorNZ()
 {
-	return factor_gridNZ;
+    return factor_gridNZ;
 }
 int Parameter::getD3Qxx()
 {
-	return this->D3Qxx;
+    return this->D3Qxx;
 }
 int Parameter::getMaxLevel()
 {
-	return this->maxlevel;
+    return this->maxlevel;
 }
 unsigned int Parameter::getTStart()
 {
-	if (getDoRestart())
-	{
-		return getTimeDoRestart() + 1;
-	} 
-	else
-	{
-		return 1;
-	}
+    if (getDoRestart()) {
+        return getTimeDoRestart() + 1;
+    } else {
+        return 1;
+    }
 }
 unsigned int Parameter::getTInit()
 {
-	if (getDoRestart())
-	{
-		return getTimeDoRestart();
-	} 
-	else
-	{
-		return 0;
-	}
+    if (getDoRestart()) {
+        return getTimeDoRestart();
+    } else {
+        return 0;
+    }
 }
 unsigned int Parameter::getTEnd()
 {
-	return ic.tend;
+    return ic.tend;
 }
 unsigned int Parameter::getTOut()
 {
-	return ic.tout;
+    return ic.tout;
 }
 unsigned int Parameter::getTStartOut()
 {
-	return ic.tStartOut;
+    return ic.tStartOut;
 }
 bool Parameter::getCalcMedian()
 {
-	return ic.calcMedian;
+    return ic.calcMedian;
 }
 bool Parameter::getCalcDragLift()
 {
-	return this->calcDragLift;
+    return this->calcDragLift;
 }
 bool Parameter::getCalcCp()
 {
-	return this->calcCp;
+    return this->calcCp;
 }
 bool Parameter::getCalcParticle()
 {
-	return this->calcParticles;
+    return this->calcParticles;
 }
 bool Parameter::getWriteVeloASCIIfiles()
 {
-	return this->writeVeloASCII;
+    return this->writeVeloASCII;
 }
 bool Parameter::getCalcPlaneConc()
 {
-	return this->calcPlaneConc;
+    return this->calcPlaneConc;
 }
 int Parameter::getTimeCalcMedStart()
 {
-	return ic.tCalcMedStart;
+    return ic.tCalcMedStart;
 }
 int Parameter::getTimeCalcMedEnd()
 {
-	return ic.tCalcMedEnd;
+    return ic.tCalcMedEnd;
 }
 std::string Parameter::getOutputPath()
 {
-	return ic.oPath;
+    return ic.oPath;
 }
 std::string Parameter::getOutputPrefix()
 {
-	return ic.oPrefix;
+    return ic.oPrefix;
 }
 std::string Parameter::getFName()
 {
-	return ic.fname;
+    return ic.fname;
 }
 bool Parameter::getPrintFiles()
 {
-	return ic.printFiles;
+    return ic.printFiles;
 }
 bool Parameter::getReadGeo()
 {
-	return ic.readGeo;
+    return ic.readGeo;
+}
+bool Parameter::getCalcTurbulenceIntensity()
+{
+    return this->calcVelocityAndFluctuations;
 }
 real Parameter::getDiffusivity()
 {
-	return ic.Diffusivity;
+    return ic.Diffusivity;
 }
 real Parameter::getTemperatureInit()
 {
-	return ic.Temp;
+    return ic.Temp;
 }
 real Parameter::getTemperatureBC()
 {
-	return ic.TempBC;
+    return ic.TempBC;
 }
 real Parameter::getViscosity()
 {
-	return ic.vis;
+    return ic.vis;
 }
 real Parameter::getVelocity()
 {
-	return ic.u0;
+    return ic.u0;
 }
 real Parameter::getViscosityRatio()
 {
-	return ic.vis_ratio;
+    return ic.vis_ratio;
 }
 real Parameter::getVelocityRatio()
 {
-	return ic.u0_ratio;
+    return ic.u0_ratio;
 }
 real Parameter::getDensityRatio()
 {
-	return ic.delta_rho;
+    return ic.delta_rho;
 }
 real Parameter::getPressRatio()
 {
-	return ic.delta_press;
-}
-real Parameter::getTimeRatio()
-{
-	return this->getViscosityRatio()*pow(this->getVelocityRatio(),-2);
-}
-real Parameter::getLengthRatio()
-{
-	return this->getViscosityRatio()/this->getVelocityRatio();
-}
-real Parameter::getForceRatio()
-{
-	return this->getDensityRatio()*pow(this->getViscosityRatio(),2);
+    return ic.delta_press;
 }
 real Parameter::getRealX()
 {
-	return ic.RealX;
+    return ic.RealX;
 }
 real Parameter::getRealY()
 {
-	return ic.RealY;
+    return ic.RealY;
 }
 unsigned int Parameter::getPressInID()
 {
-	return ic.PressInID;
+    return ic.PressInID;
 }
 unsigned int Parameter::getPressOutID()
 {
-	return ic.PressOutID;
+    return ic.PressOutID;
 }
 unsigned int Parameter::getPressInZ()
 {
-	return ic.PressInZ;
+    return ic.PressInZ;
 }
 unsigned int Parameter::getPressOutZ()
 {
-	return ic.PressOutZ;
+    return ic.PressOutZ;
 }
 int Parameter::getMaxDev()
 {
-	return ic.maxdev;
+    return ic.maxdev;
 }
 int Parameter::getMyID()
 {
-	return ic.myid;
+    return ic.myid;
 }
 int Parameter::getNumprocs()
 {
-	return ic.numprocs;
+    return ic.numprocs;
 }
 std::vector<uint> Parameter::getDevices()
 {
-	return ic.devices;
+    return ic.devices;
 }
 std::string Parameter::getGeometryFileC()
 {
-	return ic.geometryFileC;
+    return ic.geometryFileC;
 }
 std::string Parameter::getGeometryFileM()
 {
-	return ic.geometryFileM;
+    return ic.geometryFileM;
 }
 std::string Parameter::getGeometryFileF()
 {
-	return ic.geometryFileF;
+    return ic.geometryFileF;
 }
 real Parameter::getRe()
 {
-	return ic.Re;
+    return ic.Re;
 }
 real Parameter::getFactorPressBC()
 {
-	return ic.factorPressBC;
+    return ic.factorPressBC;
 }
 std::vector<int> Parameter::getGridX()
 {
-	return ic.GridX;
+    return ic.GridX;
 }
 std::vector<int> Parameter::getGridY()
 {
-	return ic.GridY;
+    return ic.GridY;
 }
 std::vector<int> Parameter::getGridZ()
 {
-	return ic.GridZ;
+    return ic.GridZ;
 }
 std::vector<int> Parameter::getDistX()
 {
-	return ic.DistX;
+    return ic.DistX;
 }
 std::vector<int> Parameter::getDistY()
 {
-	return ic.DistY;
+    return ic.DistY;
 }
 std::vector<int> Parameter::getDistZ()
 {
-	return ic.DistZ;
+    return ic.DistZ;
 }
 std::vector<real> Parameter::getScaleLBMtoSI()
 {
-	return ic.scaleLBMtoSI;
+    return ic.scaleLBMtoSI;
 }
 std::vector<real> Parameter::getTranslateLBMtoSI()
 {
-	return ic.translateLBMtoSI;
+    return ic.translateLBMtoSI;
 }
 std::vector<real> Parameter::getMinCoordX()
 {
-	return ic.minCoordX;
+    return ic.minCoordX;
 }
 std::vector<real> Parameter::getMinCoordY()
 {
-	return ic.minCoordY;
+    return ic.minCoordY;
 }
 std::vector<real> Parameter::getMinCoordZ()
 {
-	return ic.minCoordZ;
+    return ic.minCoordZ;
 }
 std::vector<real> Parameter::getMaxCoordX()
 {
-	return ic.maxCoordX;
+    return ic.maxCoordX;
 }
 std::vector<real> Parameter::getMaxCoordY()
 {
-	return ic.maxCoordY;
+    return ic.maxCoordY;
 }
 std::vector<real> Parameter::getMaxCoordZ()
 {
-	return ic.maxCoordZ;
-}
-TempforBoundaryConditions* Parameter::getTempH()
-{
-	return this->TempH;
+    return ic.maxCoordZ;
 }
-TempforBoundaryConditions* Parameter::getTempD()
+TempforBoundaryConditions *Parameter::getTempH()
 {
-	return this->TempD;
+    return this->TempH;
 }
-TempVelforBoundaryConditions* Parameter::getTempVelH()
+TempforBoundaryConditions *Parameter::getTempD()
 {
-	return this->TempVelH;
+    return this->TempD;
 }
-TempVelforBoundaryConditions* Parameter::getTempVelD()
+TempVelforBoundaryConditions *Parameter::getTempVelH()
 {
-	return this->TempVelD;
+    return this->TempVelH;
 }
-TempPressforBoundaryConditions* Parameter::getTempPressH()
+TempVelforBoundaryConditions *Parameter::getTempVelD()
 {
-	return this->TempPressH;
+    return this->TempVelD;
 }
-TempPressforBoundaryConditions* Parameter::getTempPressD()
+TempPressforBoundaryConditions *Parameter::getTempPressH()
 {
-	return this->TempPressD;
+    return this->TempPressH;
 }
-std::vector<SPtr<PreCollisionInteractor>> Parameter::getActuators()
-{
-	return actuators;
-}
-std::vector<SPtr<PreCollisionInteractor>> Parameter::getProbes()
+TempPressforBoundaryConditions *Parameter::getTempPressD()
 {
-	return probes;
+    return this->TempPressD;
 }
-//unsigned int Parameter::getkInflowQ()
+// unsigned int Parameter::getkInflowQ()
 //{
 //   return this->kInflowQ;
 //}
-//unsigned int Parameter::getkOutflowQ()
+// unsigned int Parameter::getkOutflowQ()
 //{
 //   return this->kOutflowQ;
 //}
-//QforBoundaryConditions* Parameter::getQinflowH()
+// QforBoundaryConditions* Parameter::getQinflowH()
 //{
 //   return this->QinflowH;
 //}
-//QforBoundaryConditions* Parameter::getQinflowD()
+// QforBoundaryConditions* Parameter::getQinflowD()
 //{
 //   return this->QinflowD;
 //}
-//QforBoundaryConditions* Parameter::getQoutflowH()
+// QforBoundaryConditions* Parameter::getQoutflowH()
 //{
 //   return this->QoutflowH;
 //}
-//QforBoundaryConditions* Parameter::getQoutflowD()
+// QforBoundaryConditions* Parameter::getQoutflowD()
 //{
 //   return this->QoutflowD;
 //}
 std::string Parameter::getkFull()
 {
-	return ic.kFull;
+    return ic.kFull;
 }
 std::string Parameter::getgeoFull()
 {
-	return ic.geoFull;
+    return ic.geoFull;
 }
 std::string Parameter::getgeoVec()
 {
-	return ic.geoVec;
+    return ic.geoVec;
 }
 std::string Parameter::getcoordX()
 {
-	return ic.coordX;
+    return ic.coordX;
 }
 std::string Parameter::getcoordY()
 {
-	return ic.coordY;
+    return ic.coordY;
 }
 std::string Parameter::getcoordZ()
 {
-	return ic.coordZ;
+    return ic.coordZ;
 }
 std::string Parameter::getneighborX()
 {
-	return ic.neighborX;
+    return ic.neighborX;
 }
 std::string Parameter::getneighborY()
 {
-	return ic.neighborY;
+    return ic.neighborY;
 }
 std::string Parameter::getneighborZ()
 {
-	return ic.neighborZ;
+    return ic.neighborZ;
 }
 std::string Parameter::getneighborWSB()
 {
-	return ic.neighborWSB;
+    return ic.neighborWSB;
 }
 std::string Parameter::getscaleCFC()
 {
-	return ic.scaleCFC;
+    return ic.scaleCFC;
 }
 std::string Parameter::getscaleCFF()
 {
-	return ic.scaleCFF;
+    return ic.scaleCFF;
 }
 std::string Parameter::getscaleFCC()
 {
-	return ic.scaleFCC;
+    return ic.scaleFCC;
 }
 std::string Parameter::getscaleFCF()
 {
-	return ic.scaleFCF;
+    return ic.scaleFCF;
 }
 std::string Parameter::getscaleOffsetCF()
 {
-	return ic.scaleOffsetCF;
+    return ic.scaleOffsetCF;
 }
 std::string Parameter::getscaleOffsetFC()
 {
-	return ic.scaleOffsetFC;
+    return ic.scaleOffsetFC;
 }
 std::string Parameter::getgeomBoundaryBcQs()
 {
-	return ic.geomBoundaryBcQs;
+    return ic.geomBoundaryBcQs;
 }
 std::string Parameter::getgeomBoundaryBcValues()
 {
-	return ic.geomBoundaryBcValues;
+    return ic.geomBoundaryBcValues;
 }
 std::string Parameter::getnoSlipBcPos()
 {
-	return ic.noSlipBcPos;
+    return ic.noSlipBcPos;
 }
 std::string Parameter::getnoSlipBcQs()
 {
-	return ic.noSlipBcQs;
+    return ic.noSlipBcQs;
 }
 std::string Parameter::getnoSlipBcValue()
 {
-	return ic.noSlipBcValue;
+    return ic.noSlipBcValue;
 }
 std::string Parameter::getnoSlipBcValues()
 {
-	return ic.noSlipBcValues;
+    return ic.noSlipBcValues;
 }
 std::string Parameter::getslipBcPos()
 {
-	return ic.slipBcPos;
+    return ic.slipBcPos;
 }
 std::string Parameter::getslipBcQs()
 {
-	return ic.slipBcQs;
+    return ic.slipBcQs;
 }
 std::string Parameter::getslipBcValue()
 {
-	return ic.slipBcValue;
+    return ic.slipBcValue;
 }
 std::string Parameter::getpressBcPos()
 {
-	return ic.pressBcPos;
+    return ic.pressBcPos;
 }
 std::string Parameter::getpressBcQs()
 {
-	return ic.pressBcQs;
+    return ic.pressBcQs;
 }
 std::string Parameter::getpressBcValue()
 {
-	return ic.pressBcValue;
+    return ic.pressBcValue;
 }
 std::string Parameter::getpressBcValues()
 {
-	return ic.pressBcValues;
+    return ic.pressBcValues;
 }
 std::string Parameter::getvelBcQs()
 {
-	return ic.velBcQs;
+    return ic.velBcQs;
 }
 std::string Parameter::getvelBcValues()
 {
-	return ic.velBcValues;
+    return ic.velBcValues;
 }
 std::string Parameter::getinletBcQs()
 {
-	return ic.inletBcQs;
+    return ic.inletBcQs;
 }
 std::string Parameter::getinletBcValues()
 {
-	return ic.inletBcValues;
+    return ic.inletBcValues;
 }
 std::string Parameter::getoutletBcQs()
 {
-	return ic.outletBcQs;
+    return ic.outletBcQs;
 }
 std::string Parameter::getoutletBcValues()
 {
-	return ic.outletBcValues;
+    return ic.outletBcValues;
 }
 std::string Parameter::gettopBcQs()
 {
-	return ic.topBcQs;
+    return ic.topBcQs;
 }
 std::string Parameter::gettopBcValues()
 {
-	return ic.topBcValues;
+    return ic.topBcValues;
 }
 std::string Parameter::getbottomBcQs()
 {
-	return ic.bottomBcQs;
+    return ic.bottomBcQs;
 }
 std::string Parameter::getbottomBcValues()
 {
-	return ic.bottomBcValues;
+    return ic.bottomBcValues;
 }
 std::string Parameter::getfrontBcQs()
 {
-	return ic.frontBcQs;
+    return ic.frontBcQs;
 }
 std::string Parameter::getfrontBcValues()
 {
-	return ic.frontBcValues;
+    return ic.frontBcValues;
 }
 std::string Parameter::getbackBcQs()
 {
-	return ic.backBcQs;
+    return ic.backBcQs;
 }
 std::string Parameter::getbackBcValues()
 {
-	return ic.backBcValues;
+    return ic.backBcValues;
 }
 std::string Parameter::getwallBcQs()
 {
-	return ic.wallBcQs;
+    return ic.wallBcQs;
 }
 std::string Parameter::getwallBcValues()
 {
-	return ic.wallBcValues;
+    return ic.wallBcValues;
 }
 std::string Parameter::getperiodicBcQs()
 {
-	return ic.periodicBcQs;
+    return ic.periodicBcQs;
 }
 std::string Parameter::getperiodicBcValues()
 {
-	return ic.periodicBcValues;
+    return ic.periodicBcValues;
 }
 std::string Parameter::getpropellerQs()
 {
-	return ic.propellerQs;
+    return ic.propellerQs;
 }
 std::string Parameter::getpropellerValues()
 {
-	return ic.propellerValues;
+    return ic.propellerValues;
 }
 std::string Parameter::getpropellerCylinder()
 {
-	return ic.propellerCylinder;
+    return ic.propellerCylinder;
 }
 std::string Parameter::getmeasurePoints()
 {
-	return ic.measurePoints;
+    return ic.measurePoints;
 }
 std::string Parameter::getLBMvsSI()
 {
-	return ic.LBMvsSI;
+    return ic.LBMvsSI;
 }
 std::string Parameter::getnumberNodes()
 {
-	return ic.numberNodes;
+    return ic.numberNodes;
 }
 std::string Parameter::getcpTop()
 {
-	return ic.cpTop;
+    return ic.cpTop;
 }
 std::string Parameter::getcpBottom()
 {
-	return ic.cpBottom;
+    return ic.cpBottom;
 }
 std::string Parameter::getcpBottom2()
 {
-	return ic.cpBottom2;
+    return ic.cpBottom2;
 }
 std::string Parameter::getConcentration()
 {
-	return ic.concentration;
+    return ic.concentration;
 }
 std::string Parameter::getStreetVelocityFilePath()
 {
-	return ic.streetVelocity;
+    return ic.streetVelocity;
 }
 real Parameter::getclockCycleForMP()
 {
-	return ic.clockCycleForMP;
+    return ic.clockCycleForMP;
 }
 unsigned int Parameter::getTimeDoCheckPoint()
 {
-	return ic.tDoCheckPoint;
+    return ic.tDoCheckPoint;
 }
 unsigned int Parameter::getTimeDoRestart()
 {
-	return ic.tDoRestart;
+    return ic.tDoRestart;
 }
 bool Parameter::getDoCheckPoint()
 {
-	return ic.doCheckPoint;
+    return ic.doCheckPoint;
 }
 bool Parameter::getDoRestart()
 {
-	return ic.doRestart;
+    return ic.doRestart;
 }
 bool Parameter::getIsGeo()
 {
-	return ic.isGeo;
+    return ic.isGeo;
 }
 bool Parameter::getIsGeoNormal()
 {
-	return ic.isGeoNormal;
+    return ic.isGeoNormal;
 }
 bool Parameter::getIsInflowNormal()
 {
-	return ic.isInflowNormal;
+    return ic.isInflowNormal;
 }
 bool Parameter::getIsOutflowNormal()
 {
-	return ic.isOutflowNormal;
+    return ic.isOutflowNormal;
 }
 bool Parameter::getIsCp()
 {
-	return ic.isCp;
+    return ic.isCp;
 }
 bool Parameter::getConcFile()
 {
-	return ic.isConc;
+    return ic.isConc;
 }
 bool Parameter::isStreetVelocityFile()
 {
-	return ic.streetVelocityFile;
+    return ic.streetVelocityFile;
 }
 bool Parameter::getUseMeasurePoints()
 {
-	return ic.isMeasurePoints;
+    return ic.isMeasurePoints;
 }
 bool Parameter::getUseWale()
 {
-	return ic.isWale;
+    return ic.isWale;
 }
 bool Parameter::getUseAMD()
 {
-	return ic.isAMD;
-}bool Parameter::getUseTurbulentViscosity()
+    return ic.isAMD;
+}
+bool Parameter::getUseTurbulentViscosity()
 {
-	return ic.isTurbulentViscosity;
+    return ic.isTurbulentViscosity;
 }
 real Parameter::getSGSConstant()
 {
-	return ic.SGSConstant;
+    return ic.SGSConstant;
 }
 bool Parameter::getHasWallModelMonitor()
 {
-	return ic.hasWallModelMonitor;
+    return ic.hasWallModelMonitor;
+}
+std::vector<SPtr<PreCollisionInteractor>> Parameter::getActuators()
+{
+    return actuators;
+}
+std::vector<SPtr<PreCollisionInteractor>> Parameter::getProbes()
+{
+    return probes;
 }
 bool Parameter::getUseInitNeq()
 {
-	return ic.isInitNeq;
+    return ic.isInitNeq;
 }
 bool Parameter::getSimulatePorousMedia()
 {
-	return ic.simulatePorousMedia;
+    return ic.simulatePorousMedia;
 }
 
 bool Parameter::getIsF3()
 {
-	return this->isF3; 
+    return this->isF3;
 }
 
-bool Parameter::getIsBodyForce() 
-{ 
-	return this->isBodyForce; 
+bool Parameter::getIsBodyForce()
+{
+    return this->isBodyForce;
 }
 
 bool Parameter::getIsGeometryValues()
 {
-	return ic.GeometryValues;
+    return ic.GeometryValues;
 }
 bool Parameter::getCalc2ndOrderMoments()
 {
-	return ic.is2ndOrderMoments;
+    return ic.is2ndOrderMoments;
 }
 bool Parameter::getCalc3rdOrderMoments()
 {
-	return ic.is3rdOrderMoments;
+    return ic.is3rdOrderMoments;
 }
 bool Parameter::getCalcHighOrderMoments()
 {
-	return ic.isHighOrderMoments;
+    return ic.isHighOrderMoments;
 }
 bool Parameter::getIsProp()
 {
-	return ic.isProp;
+    return ic.isProp;
 }
 bool Parameter::overWritingRestart(uint t)
 {
-	return t == getTimeDoRestart();
+    return t == getTimeDoRestart();
 }
 unsigned int Parameter::getTimestepForMP()
 {
-	return ic.timeStepForMP;
+    return ic.timeStepForMP;
 }
 unsigned int Parameter::getTimestepOfCoarseLevel()
 {
-	return this->timestep;
+    return this->timestep;
 }
 double Parameter::getMemsizeGPU()
 {
-	return this->memsizeGPU;
+    return this->memsizeGPU;
 }
-//1D domain decomposition
+// 1D domain decomposition
 std::vector<std::string> Parameter::getPossNeighborFiles(std::string sor)
 {
-	if (sor=="send")
-	{
-		return this->possNeighborFilesSend;
-	} 
-	else if (sor == "recv")
-	{
-		return this->possNeighborFilesRecv;
-	}
+    if (sor == "send") {
+        return this->possNeighborFilesSend;
+    } else if (sor == "recv") {
+        return this->possNeighborFilesRecv;
+    }
     throw std::runtime_error("Parameter string invalid.");
 }
 unsigned int Parameter::getNumberOfProcessNeighbors(int level, std::string sor)
 {
-	if (sor=="send")
-	{
-		return (unsigned int)parH[level]->sendProcessNeighbor.size();
-	} 
-	else if (sor == "recv")
-	{
-		return (unsigned int)parH[level]->recvProcessNeighbor.size();
-	}
+    if (sor == "send") {
+        return (unsigned int)parH[level]->sendProcessNeighbor.size();
+    } else if (sor == "recv") {
+        return (unsigned int)parH[level]->recvProcessNeighbor.size();
+    }
     throw std::runtime_error("Parameter string invalid.");
 }
 bool Parameter::getIsNeighbor()
 {
-	return this->isNeigbor;
+    return this->isNeigbor;
 }
-//3D domain decomposition
+// 3D domain decomposition
 std::vector<std::string> Parameter::getPossNeighborFilesX(std::string sor)
 {
-	if (sor=="send")
-	{
-		return this->possNeighborFilesSendX;
-	} 
-	else if (sor == "recv")
-	{
-		return this->possNeighborFilesRecvX;
-	}
+    if (sor == "send") {
+        return this->possNeighborFilesSendX;
+    } else if (sor == "recv") {
+        return this->possNeighborFilesRecvX;
+    }
     throw std::runtime_error("Parameter string invalid.");
 }
 std::vector<std::string> Parameter::getPossNeighborFilesY(std::string sor)
 {
-	if (sor=="send")
-	{
-		return this->possNeighborFilesSendY;
-	} 
-	else if (sor == "recv")
-	{
-		return this->possNeighborFilesRecvY;
-	}
+    if (sor == "send") {
+        return this->possNeighborFilesSendY;
+    } else if (sor == "recv") {
+        return this->possNeighborFilesRecvY;
+    }
     throw std::runtime_error("Parameter string invalid.");
 }
 std::vector<std::string> Parameter::getPossNeighborFilesZ(std::string sor)
 {
-	if (sor=="send")
-	{
-		return this->possNeighborFilesSendZ;
-	} 
-	else if (sor == "recv")
-	{
-		return this->possNeighborFilesRecvZ;
-	}
+    if (sor == "send") {
+        return this->possNeighborFilesSendZ;
+    } else if (sor == "recv") {
+        return this->possNeighborFilesRecvZ;
+    }
     throw std::runtime_error("Parameter string invalid.");
 }
 unsigned int Parameter::getNumberOfProcessNeighborsX(int level, std::string sor)
 {
-	if (sor=="send")
-	{
-		return (unsigned int)parH[level]->sendProcessNeighborX.size();
-	} 
-	else if (sor == "recv")
-	{
-		return (unsigned int)parH[level]->recvProcessNeighborX.size();
-	}
-    throw std::runtime_error("Parameter string invalid.");
+    if (sor == "send") {
+        return (unsigned int)parH[level]->sendProcessNeighborX.size();
+    } else if (sor == "recv") {
+        return (unsigned int)parH[level]->recvProcessNeighborX.size();
+    }
+    throw std::runtime_error("getNumberOfProcessNeighborsX: Parameter string invalid.");
 }
 unsigned int Parameter::getNumberOfProcessNeighborsY(int level, std::string sor)
 {
-	if (sor=="send")
-	{
-		return (unsigned int)parH[level]->sendProcessNeighborY.size();
-	} 
-	else if (sor == "recv")
-	{
-		return (unsigned int)parH[level]->recvProcessNeighborY.size();
-	}
-    throw std::runtime_error("Parameter string invalid.");
+    if (sor == "send") {
+        return (unsigned int)parH[level]->sendProcessNeighborY.size();
+    } else if (sor == "recv") {
+        return (unsigned int)parH[level]->recvProcessNeighborY.size();
+    }
+    throw std::runtime_error("getNumberOfProcessNeighborsY: Parameter string invalid.");
 }
 unsigned int Parameter::getNumberOfProcessNeighborsZ(int level, std::string sor)
 {
-	if (sor=="send")
-	{
-		return (unsigned int)parH[level]->sendProcessNeighborZ.size();
-	} 
-	else if (sor == "recv")
-	{
-		return (unsigned int)parH[level]->recvProcessNeighborZ.size();
-	}
-    throw std::runtime_error("Parameter string invalid.");
+    if (sor == "send") {
+        return (unsigned int)parH[level]->sendProcessNeighborZ.size();
+    } else if (sor == "recv") {
+        return (unsigned int)parH[level]->recvProcessNeighborZ.size();
+    }
+    throw std::runtime_error("getNumberOfProcessNeighborsZ: Parameter string invalid.");
 }
 
 bool Parameter::getIsNeighborX()
 {
-	return this->isNeigborX;
+    return this->isNeigborX;
 }
 bool Parameter::getIsNeighborY()
 {
-	return this->isNeigborY;
+    return this->isNeigborY;
 }
 bool Parameter::getIsNeighborZ()
 {
-	return this->isNeigborZ;
+    return this->isNeigborZ;
 }
 std::string Parameter::getgeomBoundaryNormalX()
 {
-	return ic.geomNormalX;
+    return ic.geomNormalX;
 }
 std::string Parameter::getgeomBoundaryNormalY()
 {
-	return ic.geomNormalY;
+    return ic.geomNormalY;
 }
 std::string Parameter::getgeomBoundaryNormalZ()
 {
-	return ic.geomNormalZ;
+    return ic.geomNormalZ;
 }
 std::string Parameter::getInflowBoundaryNormalX()
 {
-	return ic.inflowNormalX;
+    return ic.inflowNormalX;
 }
 std::string Parameter::getInflowBoundaryNormalY()
 {
-	return ic.inflowNormalY;
+    return ic.inflowNormalY;
 }
 std::string Parameter::getInflowBoundaryNormalZ()
 {
-	return ic.inflowNormalZ;
+    return ic.inflowNormalZ;
 }
 std::string Parameter::getOutflowBoundaryNormalX()
 {
-	return ic.outflowNormalX;
+    return ic.outflowNormalX;
 }
 std::string Parameter::getOutflowBoundaryNormalY()
 {
-	return ic.outflowNormalY;
+    return ic.outflowNormalY;
 }
 std::string Parameter::getOutflowBoundaryNormalZ()
 {
-	return ic.outflowNormalZ;
+    return ic.outflowNormalZ;
 }
-curandState* Parameter::getRandomState()
+curandState *Parameter::getRandomState()
 {
-	return this->devState;
+    return this->devState;
 }
 
 std::string Parameter::getMainKernel()
 {
-	return mainKernel;
+    return mainKernel;
 }
 bool Parameter::getMultiKernelOn()
 {
-	return multiKernelOn;
+    return multiKernelOn;
 }
-std::vector< int> Parameter::getMultiKernelLevel()
+std::vector<int> Parameter::getMultiKernelLevel()
 {
-	return multiKernelLevel;
+    return multiKernelLevel;
 }
 std::vector<std::string> Parameter::getMultiKernel()
 {
-	return multiKernel;
+    return multiKernel;
 }
 std::string Parameter::getADKernel()
 {
-	return adKernel;
+    return adKernel;
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-void Parameter::setInitialCondition(std::function<void(real,real,real,real&,real&,real&,real&)> initialCondition)
+void Parameter::setInitialCondition(
+    std::function<void(real, real, real, real &, real &, real &, real &)> initialCondition)
 {
     this->initialCondition = initialCondition;
 }
 
-std::function<void(real,real,real,real&,real&,real&,real&)>& Parameter::getInitialCondition()
+std::function<void(real, real, real, real &, real &, real &, real &)> &Parameter::getInitialCondition()
 {
     return this->initialCondition;
 }
 
 real Parameter::TrafoXtoWorld(int CoordX, int level)
 {
-	return (parH[level]->mTtoWx*CoordX+parH[level]->cTtoWx);
+    return (parH[level]->mTtoWx * CoordX + parH[level]->cTtoWx);
 }
 real Parameter::TrafoYtoWorld(int CoordY, int level)
 {
-	return (parH[level]->mTtoWy*CoordY+parH[level]->cTtoWy);
+    return (parH[level]->mTtoWy * CoordY + parH[level]->cTtoWy);
 }
 real Parameter::TrafoZtoWorld(int CoordZ, int level)
 {
-	return (parH[level]->mTtoWz*CoordZ+parH[level]->cTtoWz);
+    return (parH[level]->mTtoWz * CoordZ + parH[level]->cTtoWz);
 }
 real Parameter::TrafoXtoMGsWorld(int CoordX, int level)
 {
-	real temp = 0;
-	for (int i = 0; i <= level; i++)
-	{
-		temp += (parH[i]->XdistKn + 0.25f) * 2.f * parH[i]->dx;
-	}
-	temp += (real)((CoordX ) * parH[level]->dx);
-	return temp;
+    real temp = 0;
+    for (int i = 0; i <= level; i++) {
+        temp += (parH[i]->XdistKn + 0.25f) * 2.f * parH[i]->dx;
+    }
+    temp += (real)((CoordX)*parH[level]->dx);
+    return temp;
 }
 real Parameter::TrafoYtoMGsWorld(int CoordY, int level)
 {
-	real temp = 0;
-	for (int i = 0; i <= level; i++)
-	{
-		temp += (parH[i]->YdistKn + 0.25f) * 2.f * parH[i]->dx;
-	}
-	temp += (real)((CoordY ) * parH[level]->dx);
-	return temp;
+    real temp = 0;
+    for (int i = 0; i <= level; i++) {
+        temp += (parH[i]->YdistKn + 0.25f) * 2.f * parH[i]->dx;
+    }
+    temp += (real)((CoordY)*parH[level]->dx);
+    return temp;
 }
 real Parameter::TrafoZtoMGsWorld(int CoordZ, int level)
 {
-	real temp = 0;
-	for (int i = 0; i <= level; i++)
-	{
-		temp += (parH[i]->ZdistKn + 0.25f) * 2.f * parH[i]->dx;
-	}
-	temp += (real)((CoordZ) * parH[level]->dx);
-	return temp;
+    real temp = 0;
+    for (int i = 0; i <= level; i++) {
+        temp += (parH[i]->ZdistKn + 0.25f) * 2.f * parH[i]->dx;
+    }
+    temp += (real)((CoordZ)*parH[level]->dx);
+    return temp;
 }
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+void Parameter::setUseStreams(bool useStreams)
+{
+    if (useStreams) {
+        if (this->getNumprocs() != 1) {
+            this->useStreams = useStreams;
+            this->cudaStreamManager = std::make_unique<CudaStreamManager>();
+            return;
+        } else {
+            std::cout << "Can't use streams with only one process!" << std::endl;
+        }
+    }
+    this->useStreams = false;
+}
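+// Usage sketch (illustrative): streams only take effect in multi-process runs.
+//   para->setUseStreams(true);   // falls back to false when getNumprocs() == 1
+//   if (para->getUseStreams()) { /* CudaStreamManager available via getStreamManager() */ }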
+
+bool Parameter::getUseStreams()
+{
+    return this->useStreams;
+}
+
+std::unique_ptr<CudaStreamManager> &Parameter::getStreamManager()
+{
+    return this->cudaStreamManager;
+}
+
+bool Parameter::getKernelNeedsFluidNodeIndicesToRun()
+{
+    return this->kernelNeedsFluidNodeIndicesToRun;
+}
+
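+// An edge node appears in the receive buffer of one direction and in the send buffer of
+// another direction (e.g. received via X, sent via Y). They are collected per level so
+// that the multi-GPU communication routines can treat them separately.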
+void Parameter::findEdgeNodesCommMultiGPU()
+{
+    for (uint level = 0; level < parH.size(); level++) {
+        findEdgeNodesXY(level);
+        findEdgeNodesXZ(level);
+        findEdgeNodesYZ(level);
+    }
+}
+
+void Parameter::findEdgeNodesXY(int level)
+{
+    int indexOfProcessNeighborSend;
+    int indexInSendBuffer;
+    for (uint i = 0; i < (unsigned int)(this->getNumberOfProcessNeighborsX(level, "recv")); i++) {
+        for (int j = 0; j < parH[level]->recvProcessNeighborX[i].numberOfNodes; j++) {
+            int index       = parH[level]->recvProcessNeighborX[i].index[j];
+            bool foundIndex = findIndexInSendNodesXY(level, index, indexOfProcessNeighborSend, indexInSendBuffer);
+            if (foundIndex) {
+                this->parH[level]->edgeNodesXtoY.emplace_back(i, j, indexOfProcessNeighborSend, indexInSendBuffer);
+            }
+        }
+    }
+}
+
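+// Linear search of the Y-direction send buffers for "index"; on success, the matching
+// neighbor and the position inside its send buffer are returned via the out-parameters.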
+bool Parameter::findIndexInSendNodesXY(int level, int index, int &indexOfProcessNeighborSend, int &indexInSendBuffer)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsY(level, "send")); k++) {
+        for (int l = 0; l < parH[level]->sendProcessNeighborY[k].numberOfNodes; l++) {
+            if (parH[level]->sendProcessNeighborY[k].index[l] == index) {
+                indexOfProcessNeighborSend = k;
+                indexInSendBuffer          = l;
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+void Parameter::findEdgeNodesXZ(int level)
+{
+    int indexOfProcessNeighborSend;
+    int indexInSendBuffer;
+    for (uint i = 0; i < (unsigned int)(this->getNumberOfProcessNeighborsX(level, "recv")); i++) {
+        for (int j = 0; j < parH[level]->recvProcessNeighborX[i].numberOfNodes; j++) {
+            int index       = parH[level]->recvProcessNeighborX[i].index[j];
+            bool foundIndex = findIndexInSendNodesXZ(level, index, indexOfProcessNeighborSend, indexInSendBuffer);
+            if (foundIndex) {
+                this->parH[level]->edgeNodesXtoZ.emplace_back(i, j, indexOfProcessNeighborSend, indexInSendBuffer);
+            }
+        }
+    }
+}
+
+bool Parameter::findIndexInSendNodesXZ(int level, int index, int &indexOfProcessNeighborSend, int &indexInSendBuffer)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsZ(level, "send")); k++) {
+        for (int l = 0; l < parH[level]->sendProcessNeighborZ[k].numberOfNodes; l++) {
+            if (parH[level]->sendProcessNeighborZ[k].index[l] == index) {
+                indexOfProcessNeighborSend = k;
+                indexInSendBuffer          = l;
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+void Parameter::findEdgeNodesYZ(int level)
+{
+    int indexOfProcessNeighborSend;
+    int indexInSendBuffer;
+    for (uint i = 0; i < (unsigned int)(this->getNumberOfProcessNeighborsY(level, "recv")); i++) {
+        for (int j = 0; j < parH[level]->recvProcessNeighborY[i].numberOfNodes; j++) {
+            int index       = parH[level]->recvProcessNeighborY[i].index[j];
+            bool foundIndex = findIndexInSendNodesYZ(level, index, indexOfProcessNeighborSend, indexInSendBuffer);
+            if (foundIndex) {
+                this->parH[level]->edgeNodesYtoZ.emplace_back(i, j, indexOfProcessNeighborSend, indexInSendBuffer);
+            }
+        }
+    }
+}
+
+bool Parameter::findIndexInSendNodesYZ(int level, int index, int &indexOfProcessNeighborSend, int &indexInSendBuffer)
+{
+    for (uint k = 0; k < (unsigned int)(this->getNumberOfProcessNeighborsZ(level, "send")); k++) {
+        for (int l = 0; l < parH[level]->sendProcessNeighborZ[k].numberOfNodes; l++) {
+            if (parH[level]->sendProcessNeighborZ[k].index[l] == index) {
+                indexOfProcessNeighborSend = k;
+                indexInSendBuffer          = l;
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
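+// The "AfterFtoC" neighbor arrays are used for the reduced communication after the
+// fine-to-coarse interpolation (cf. useReducedCommunicationAfterFtoC); host and device
+// vectors are sized to match the full process-neighbor arrays.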
+void Parameter::initProcessNeighborsAfterFtoCX(int level)
+{
+    this->getParH(level)->sendProcessNeighborsAfterFtoCX.resize(this->getParH(level)->sendProcessNeighborX.size());
+    this->getParH(level)->recvProcessNeighborsAfterFtoCX.resize(this->getParH(level)->recvProcessNeighborX.size());
+    this->getParD(level)->sendProcessNeighborsAfterFtoCX.resize(
+        this->getParH(level)->sendProcessNeighborsAfterFtoCX.size());
+    this->getParD(level)->recvProcessNeighborsAfterFtoCX.resize(
+        this->getParH(level)->recvProcessNeighborsAfterFtoCX.size());
+}
+
+void Parameter::initProcessNeighborsAfterFtoCY(int level)
+{
+    this->getParH(level)->sendProcessNeighborsAfterFtoCY.resize(this->getParH(level)->sendProcessNeighborY.size());
+    this->getParH(level)->recvProcessNeighborsAfterFtoCY.resize(this->getParH(level)->recvProcessNeighborY.size());
+    this->getParD(level)->sendProcessNeighborsAfterFtoCY.resize(
+        this->getParH(level)->sendProcessNeighborsAfterFtoCY.size());
+    this->getParD(level)->recvProcessNeighborsAfterFtoCY.resize(
+        this->getParH(level)->recvProcessNeighborsAfterFtoCY.size());
+}
+
+void Parameter::initProcessNeighborsAfterFtoCZ(int level)
+{
+    this->getParH(level)->sendProcessNeighborsAfterFtoCZ.resize(this->getParH(level)->sendProcessNeighborZ.size());
+    this->getParH(level)->recvProcessNeighborsAfterFtoCZ.resize(this->getParH(level)->recvProcessNeighborZ.size());
+    this->getParD(level)->sendProcessNeighborsAfterFtoCZ.resize(
+        this->getParH(level)->sendProcessNeighborsAfterFtoCZ.size());
+    this->getParD(level)->recvProcessNeighborsAfterFtoCZ.resize(
+        this->getParH(level)->recvProcessNeighborsAfterFtoCZ.size());
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
index 48cf410ff8b700ef69d26883c5ef22048f9fd322..6646cdbfbe70d4be9d35606309b839811cf6ec36 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
@@ -1,28 +1,28 @@
 //=======================================================================================
-// ____          ____    __    ______     __________   __      __       __        __         
-// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
-//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
-//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
-//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
-//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
-//      \    \  |    |   ________________________________________________________________    
-//       \    \ |    |  |  ______________________________________________________________|   
-//        \    \|    |  |  |         __          __     __     __     ______      _______    
-//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
-//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+// ____          ____    __    ______     __________   __      __       __        __
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|
+//      \    \  |    |   ________________________________________________________________
+//       \    \ |    |  |  ______________________________________________________________|
+//        \    \|    |  |  |         __          __     __     __     ______      _______
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______
 //           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
-//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/
 //
-//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can
 //  redistribute it and/or modify it under the terms of the GNU General Public
-//  License as published by the Free Software Foundation, either version 3 of 
+//  License as published by the Free Software Foundation, either version 3 of
 //  the License, or (at your option) any later version.
-//  
-//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
-//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
-//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 //  for more details.
-//  
+//
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
@@ -33,10 +33,10 @@
 #ifndef GPU_PARAMETER_H
 #define GPU_PARAMETER_H
 
-#include <vector>
-#include <string>
-#include <memory>
 #include <functional>
+#include <memory>
+#include <string>
+#include <vector>
 
 #include "LBM/D3Q27.h"
 #include "LBM/LB.h"
@@ -44,7 +44,6 @@
 
 #include "VirtualFluids_GPU_export.h"
 
-
 struct curandStateXORWOW;
 typedef struct curandStateXORWOW curandState;
 namespace vf
@@ -53,13 +52,13 @@ namespace basics
 {
 class ConfigurationFile;
 }
-}
+} // namespace vf
+class CudaStreamManager;
 
 //! \struct LBMSimulationParameter
 //! \brief struct holds and manages the LB-parameter of the simulation
 //! \brief For this purpose it holds structures and pointer for host and device data, respectively.
-struct LBMSimulationParameter
-{
+struct LBMSimulationParameter {
     bool evenOrOdd;
     unsigned int numberofthreads;
 
@@ -119,6 +118,11 @@ struct LBMSimulationParameter
     real *turbViscosity;
     real *gSij, *gSDij, *gDxvx, *gDyvx, *gDzvx, *gDxvy, *gDyvy, *gDzvy, *gDxvz, *gDyvz, *gDzvz; // DebugInformation
 
+    // turbulence intensity //
+    real *vx_mean, *vy_mean, *vz_mean;       // means
+    real *vxx, *vyy, *vzz, *vxy, *vxz, *vyz; // fluctuations
+    std::vector<real> turbulenceIntensity;
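+    // filled when calcVelocityAndFluctuations is enabled (see setCalcTurbulenceIntensity)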
+
     // macroscopic values//////
     real *vx, *vy, *vz, *rho;
     real *vx_SP, *vy_SP, *vz_SP, *rho_SP, *press_SP;
@@ -182,8 +186,14 @@ struct LBMSimulationParameter
     unsigned int mem_size_kCF;
     unsigned int mem_size_kFC;
 
+    InterpolationCellFC intFCBorder;
+    InterpolationCellFC intFCBulk;
+    InterpolationCellCF intCFBorder;
+    InterpolationCellCF intCFBulk;
+
     // offset//////////////////
     OffsetCF offCF;
+    OffsetCF offCFBulk;
     OffsetFC offFC;
     unsigned int mem_size_kCF_off;
     unsigned int mem_size_kFC_off;
@@ -205,7 +215,7 @@ struct LBMSimulationParameter
     unsigned int kPressQ = 0, kPressQread;
 
     WallModelParameters wallModel;
-    
+
     // testRoundoffError
     Distributions27 kDistTestRE;
 
@@ -294,6 +304,13 @@ struct LBMSimulationParameter
     std::vector<ProcessNeighbor27> recvProcessNeighborX;
     std::vector<ProcessNeighbor27> recvProcessNeighborY;
     std::vector<ProcessNeighbor27> recvProcessNeighborZ;
+
+    std::vector<ProcessNeighbor27> sendProcessNeighborsAfterFtoCX;
+    std::vector<ProcessNeighbor27> sendProcessNeighborsAfterFtoCY;
+    std::vector<ProcessNeighbor27> sendProcessNeighborsAfterFtoCZ;
+    std::vector<ProcessNeighbor27> recvProcessNeighborsAfterFtoCX;
+    std::vector<ProcessNeighbor27> recvProcessNeighborsAfterFtoCY;
+    std::vector<ProcessNeighbor27> recvProcessNeighborsAfterFtoCZ;
     ///////////////////////////////////////////////////////
     // 3D domain decomposition convection diffusion
     std::vector<ProcessNeighbor27> sendProcessNeighborADX;
@@ -311,17 +328,40 @@ struct LBMSimulationParameter
     std::vector<ProcessNeighborF3> recvProcessNeighborF3Y;
     std::vector<ProcessNeighborF3> recvProcessNeighborF3Z;
     ////////////////////////////////////////////////////////////////////////////
+    // 3D domain decomposition: position (index in array) of edge nodes in ProcessNeighbor27
+    struct EdgeNodePositions {
+        int indexOfProcessNeighborRecv;
+        int indexInRecvBuffer;
+        int indexOfProcessNeighborSend;
+        int indexInSendBuffer;
+        EdgeNodePositions(int indexOfProcessNeighborRecv, int indexInRecvBuffer, int indexOfProcessNeighborSend,
+                          int indexInSendBuffer)
+            : indexOfProcessNeighborRecv(indexOfProcessNeighborRecv), indexInRecvBuffer(indexInRecvBuffer),
+              indexOfProcessNeighborSend(indexOfProcessNeighborSend), indexInSendBuffer(indexInSendBuffer)
+        {
+        }
+    };
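+    //! naming convention: edgeNodesXtoY are received via X and sent via Y (analogous for XtoZ, YtoZ)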
+    std::vector<EdgeNodePositions> edgeNodesXtoY;
+    std::vector<EdgeNodePositions> edgeNodesXtoZ;
+    std::vector<EdgeNodePositions> edgeNodesYtoZ;
+
+    ///////////////////////////////////////////////////////
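+    // index lists of all fluid nodes and of the fluid nodes at the subdomain border,
+    // for kernels that need fluid-node indices to run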
+    uint *fluidNodeIndices;
+    uint numberOfFluidNodes;
+    uint *fluidNodeIndicesBorder;
+    uint numberOffluidNodesBorder;
 };
 
 class VIRTUALFLUIDS_GPU_EXPORT Parameter
 {
 public:
     Parameter(const vf::basics::ConfigurationFile &configData, int numberOfProcesses, int myId);
+    ~Parameter();
     void initLBMSimulationParameter();
 
     std::shared_ptr<LBMSimulationParameter> getParH(int level);
     std::shared_ptr<LBMSimulationParameter> getParD(int level);
-    
+
     void copyMeasurePointsArrayToVector(int lev);
 
     //////////////////////////////////////////////////////////////////////////
@@ -350,6 +390,7 @@ public:
     void setTOut(unsigned int tout);
     void setTStartOut(unsigned int tStartOut);
     void setTimestepOfCoarseLevel(unsigned int timestep);
+    void setCalcTurbulenceIntensity(bool calcVelocityAndFluctuations);
     void setCalcMedian(bool calcMedian);
     void setCalcDragLift(bool calcDragLift);
     void setCalcCp(bool calcCp);
@@ -454,8 +495,8 @@ public:
     void setUseMeasurePoints(bool useMeasurePoints);
     void setUseWale(bool useWale);
     void setUseTurbulentViscosity(bool useTurbulentViscosity);
-    void setUseAMD( bool useAMD);
-    void setSGSConstant( real SGSConstant);
+    void setUseAMD(bool useAMD);
+    void setSGSConstant(real SGSConstant);
     void setHasWallModelMonitor(bool hasWallModelMonitor);
     void setUseInitNeq(bool useInitNeq);
     void setSimulatePorousMedia(bool simulatePorousMedia);
@@ -507,6 +548,12 @@ public:
     void setIsNeighborX(bool isNeighbor);
     void setIsNeighborY(bool isNeighbor);
     void setIsNeighborZ(bool isNeighbor);
+    void setSendProcessNeighborsAfterFtoCX(int numberOfNodes, int level, int arrayIndex);
+    void setSendProcessNeighborsAfterFtoCY(int numberOfNodes, int level, int arrayIndex);
+    void setSendProcessNeighborsAfterFtoCZ(int numberOfNodes, int level, int arrayIndex);
+    void setRecvProcessNeighborsAfterFtoCX(int numberOfNodes, int level, int arrayIndex);
+    void setRecvProcessNeighborsAfterFtoCY(int numberOfNodes, int level, int arrayIndex);
+    void setRecvProcessNeighborsAfterFtoCZ(int numberOfNodes, int level, int arrayIndex);
     // void setkInflowQ(unsigned int kInflowQ);
     // void setkOutflowQ(unsigned int kOutflowQ);
     // void setQinflowH(QforBoundaryConditions* QinflowH);
@@ -531,10 +578,10 @@ public:
 
     void setADKernel(std::string adKernel);
 
-    //adder
+    // adder
 
-	void addActuator(SPtr<PreCollisionInteractor> actuator);
-	void addProbe(SPtr<PreCollisionInteractor> probes);
+    void addActuator(SPtr<PreCollisionInteractor> actuator);
+    void addProbe(SPtr<PreCollisionInteractor> probes);
 
     // getter
     double *getForcesDouble();
@@ -556,6 +603,7 @@ public:
     bool getCompOn();
     bool getPrintFiles();
     bool getReadGeo();
+    bool getCalcTurbulenceIntensity();
     bool getCalcMedian();
     bool getCalcDragLift();
     bool getCalcCp();
@@ -664,10 +712,10 @@ public:
     real getViscosityRatio();
     real getVelocityRatio();
     real getDensityRatio();
-    real getPressRatio();    
+    real getPressRatio();
     real getTimeRatio();
     real getLengthRatio();
-    real getForceRatio();    
+    real getForceRatio();
     real getRealX();
     real getRealY();
     real getRe();
@@ -774,23 +822,27 @@ public:
 
     std::vector<std::shared_ptr<LBMSimulationParameter>> parH = std::vector<std::shared_ptr<LBMSimulationParameter>>(1);
     std::vector<std::shared_ptr<LBMSimulationParameter>> parD = std::vector<std::shared_ptr<LBMSimulationParameter>>(1);
+
+    ////////////////////////////////////////////////////////////////////////////
+
 private:
     void readConfigData(const vf::basics::ConfigurationFile &configData);
 
-    bool compOn { false };
-    bool diffOn { false };
-    bool isF3 { false };
-    bool calcDragLift { false };
-    bool calcCp { false };
-    bool writeVeloASCII { false };
-    bool calcPlaneConc { false };
-    bool isBodyForce { false };
-    int diffMod {27};
-    int maxlevel {0};
-    int coarse {0};
-    int fine {0};
-    int factor_gridNZ {2};
-    int D3Qxx {27};
+    bool compOn{ false };
+    bool diffOn{ false };
+    bool isF3{ false };
+    bool calcDragLift{ false };
+    bool calcCp{ false };
+    bool writeVeloASCII{ false };
+    bool calcPlaneConc{ false };
+    bool calcVelocityAndFluctuations{ false };
+    bool isBodyForce{ false };
+    int diffMod{ 27 };
+    int maxlevel{ 0 };
+    int coarse{ 0 };
+    int fine{ 0 };
+    int factor_gridNZ{ 2 };
+    int D3Qxx{ 27 };
     InitCondition ic;
     double memsizeGPU;
     unsigned int limitOfNodesForVTK;
@@ -798,21 +850,21 @@ private:
     unsigned int timestep;
 
     // Kernel
-    std::string mainKernel { "CumulantK17Comp" };
-    bool multiKernelOn { false };
+    std::string mainKernel{ "CumulantK17Comp" };
+    bool multiKernelOn{ false };
     std::vector<int> multiKernelLevel;
     std::vector<std::string> multiKernel;
-
+    bool kernelNeedsFluidNodeIndicesToRun = false;
     std::string adKernel;
 
     //////////////////////////////////////////////////////////////////////////
     // particles
-    int particleBasicLevel {0};
-    int particleInitLevel {0};
-    int numberOfParticles {0};
-    bool calcParticles {false};
-    real startXHotWall {(real)0.0};
-    real endXHotWall {(real)0.0};
+    int particleBasicLevel{ 0 };
+    int particleInitLevel{ 0 };
+    int numberOfParticles{ 0 };
+    bool calcParticles{ false };
+    real startXHotWall{ (real)0.0 };
+    real endXHotWall{ (real)0.0 };
     //////////////////////////////////////////////////////////////////////////
     // CUDA random number generation
     curandState *devState;
@@ -826,13 +878,13 @@ private:
     TempPressforBoundaryConditions *TempPressH, *TempPressD;
 
     // Drehung///////////////
-    real Phi {0.0};
-	real angularVelocity;
+    real Phi{ 0.0 };
+    real angularVelocity;
     unsigned int startTurn;
 
     // PreCollisionInteractors //////////////
     std::vector<SPtr<PreCollisionInteractor>> actuators;
-	std::vector<SPtr<PreCollisionInteractor>> probes;
+    std::vector<SPtr<PreCollisionInteractor>> probes;
 
     // Step of Ensight writing//
     unsigned int stepEnsight;
@@ -860,6 +912,37 @@ private:
     ////////////////////////////////////////////////////////////////////////////
     // initial condition
     std::function<void(real, real, real, real &, real &, real &, real &)> initialCondition;
+
+    ////////////////////////////////////////////////////////////////////////////
+    // cuda streams
+
+    //! determines whether streams and thus communication hiding should be used
+    bool useStreams{ false };
+    std::unique_ptr<CudaStreamManager> cudaStreamManager;
+
+public:
+    //! \brief sets whether streams and thus communication hiding should be used
+    //! \details This function is only useful for simulations on multiple GPUs. If there is only one MPI process, the
+    //! passed value is automatically overwritten with false.
+    void setUseStreams(bool useStreams);
+    bool getUseStreams();
+    std::unique_ptr<CudaStreamManager> &getStreamManager();
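+    //! \brief returns true if the main kernel runs on fluid-node index lists (kernels whose name contains "Stream")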
+    bool getKernelNeedsFluidNodeIndicesToRun();
+
+    void initProcessNeighborsAfterFtoCX(int level);
+    void initProcessNeighborsAfterFtoCY(int level);
+    void initProcessNeighborsAfterFtoCZ(int level);
+
+    void findEdgeNodesCommMultiGPU();
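+    //! \brief if true, only the reduced "AfterFtoC" process neighbors are exchanged after the fine-to-coarse interpolation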
+    bool useReducedCommunicationAfterFtoC{ true };
+
+private:
+    void findEdgeNodesXY(int level);
+    bool findIndexInSendNodesXY(int level, int index, int &indexOfProcessNeighborSend, int &indexInSendBuffer);
+    void findEdgeNodesXZ(int level);
+    bool findIndexInSendNodesXZ(int level, int index, int &indexOfProcessNeighborSend, int &indexInSendBuffer);
+    void findEdgeNodesYZ(int level);
+    bool findIndexInSendNodesYZ(int level, int index, int &indexOfProcessNeighborSend, int &indexInSendBuffer);
 };
 
 #endif
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp b/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
index aa0551632e566768aaa9b087c072f665d6f7bc3d..a99c7b11c9cf6bdc5a6c1b29b3eb1d2328facdd4 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/ParameterTest.cpp
@@ -1,22 +1,20 @@
 #include <gmock/gmock.h>
 
+#include <filesystem>
 #include <iostream>
 #include <string>
-#include <filesystem>
 
 #include "Parameter.h"
 #include <basics/config/ConfigurationFile.h>
 
-
-auto RealEq = [](auto value) { 
+auto RealEq = [](auto value) {
 #ifdef VF_DOUBLE_ACCURACY
-    return testing::DoubleEq(value); 
-#else 
+    return testing::DoubleEq(value);
+#else
     return testing::FloatEq(value);
 #endif
 };
 
-
 TEST(ParameterTest, passingEmptyFileWithoutPath_ShouldThrow)
 {
     // assuming that the config file is stored parallel to this file.
@@ -50,7 +48,7 @@ TEST(ParameterTest, check_all_Parameter_CanBePassedToConstructor)
 
     // test optional parameter
     EXPECT_THAT(para.getMaxDev(), testing::Eq(2));
-    EXPECT_THAT(para.getDevices(), testing::ElementsAreArray({2,3}));
+    EXPECT_THAT(para.getDevices(), testing::ElementsAreArray({ 2, 3 }));
     EXPECT_THAT(para.getOutputPrefix(), testing::Eq("MyPrefix"));
     EXPECT_THAT(para.getPrintFiles(), testing::Eq(true));
     EXPECT_THAT(para.getIsGeometryValues(), testing::Eq(true));
@@ -105,16 +103,16 @@ TEST(ParameterTest, check_all_Parameter_CanBePassedToConstructor)
     EXPECT_THAT(para.getclockCycleForMP(), RealEq(0.4));
     EXPECT_THAT(para.getTimestepForMP(), testing::Eq(4));
 
-    std::vector<real> forces {2.0,2.1,2.2};
-    double* forces_actual = para.getForcesDouble();
+    std::vector<real> forces{ 2.0, 2.1, 2.2 };
+    double *forces_actual = para.getForcesDouble();
     for (size_t i = 0; i < forces.size(); ++i) {
-         EXPECT_THAT((real)forces_actual[i], RealEq(forces[i]));
+        EXPECT_THAT((real)forces_actual[i], RealEq(forces[i]));
     }
 
-    std::vector<real> limiters {3.0,3.1,3.2};
-    double* limiters_actual = para.getQuadricLimitersDouble();
+    std::vector<real> limiters{ 3.0, 3.1, 3.2 };
+    double *limiters_actual = para.getQuadricLimitersDouble();
     for (size_t i = 0; i < limiters.size(); ++i) {
-         EXPECT_THAT((real)limiters_actual[i], RealEq(limiters[i]));
+        EXPECT_THAT((real)limiters_actual[i], RealEq(limiters[i]));
     }
 
     EXPECT_THAT(para.getCalcParticle(), testing::Eq(true));
@@ -130,29 +128,185 @@ TEST(ParameterTest, check_all_Parameter_CanBePassedToConstructor)
     EXPECT_THAT(para.getDoRestart(), testing::Eq(true));
     EXPECT_THAT(para.getMaxLevel(), testing::Eq(1)); // NOGL - 1
 
-    EXPECT_THAT(para.getGridX(), testing::ElementsAreArray({100, 101}));
-    EXPECT_THAT(para.getGridY(), testing::ElementsAreArray({200, 201}));
-    EXPECT_THAT(para.getGridZ(), testing::ElementsAreArray({300, 301}));
-    EXPECT_THAT(para.getDistX(), testing::ElementsAreArray({400, 401}));
-    EXPECT_THAT(para.getDistY(), testing::ElementsAreArray({500, 501}));
-    EXPECT_THAT(para.getDistZ(), testing::ElementsAreArray({600, 601}));
+    EXPECT_THAT(para.getGridX(), testing::ElementsAreArray({ 100, 101 }));
+    EXPECT_THAT(para.getGridY(), testing::ElementsAreArray({ 200, 201 }));
+    EXPECT_THAT(para.getGridZ(), testing::ElementsAreArray({ 300, 301 }));
+    EXPECT_THAT(para.getDistX(), testing::ElementsAreArray({ 400, 401 }));
+    EXPECT_THAT(para.getDistY(), testing::ElementsAreArray({ 500, 501 }));
+    EXPECT_THAT(para.getDistZ(), testing::ElementsAreArray({ 600, 601 }));
 
     EXPECT_THAT(para.getMainKernel(), testing::Eq("KernelName"));
     EXPECT_THAT(para.getMultiKernelOn(), testing::Eq(true));
-    EXPECT_THAT(para.getMultiKernelLevel(), testing::ElementsAreArray({3, 2, 1}));
+    EXPECT_THAT(para.getMultiKernelLevel(), testing::ElementsAreArray({ 3, 2, 1 }));
 
-    std::vector<std::string> kernel {"Kernel1", "Kernel2", "Kernel3"};
+    std::vector<std::string> kernel{ "Kernel1", "Kernel2", "Kernel3" };
     auto kernel_actual = para.getMultiKernel();
     for (size_t i = 0; i < kernel.size(); ++i) {
         EXPECT_THAT(kernel_actual[i], testing::Eq(kernel[i]));
     }
 
-
     EXPECT_THAT(para.getCoarse(), testing::Eq(0));
-    EXPECT_THAT(para.getFine(), testing::Eq(1));  // NOGL - 1
+    EXPECT_THAT(para.getFine(), testing::Eq(1)); // NOGL - 1
     EXPECT_THAT(para.parH.size(), testing::Eq(2));
     EXPECT_THAT(para.parD.size(), testing::Eq(2));
 }
 
+static std::shared_ptr<Parameter> initParameterClass()
+{
+    std::filesystem::path filePath = __FILE__; // assuming that the config file is stored parallel to this file.
+    filePath.replace_filename("parameterTest.cfg");
+    vf::basics::ConfigurationFile config;
+    config.load(filePath.string());
+    return std::make_shared<Parameter>(config, 1, 0);
+}
+
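+// Check that the (neighbor index, buffer position) pairs of the found edge nodes match
+// the expected pairs: compareEdgeNodesRecv checks the receive side, compareEdgeNodesSend
+// the send side.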
+static bool compareEdgeNodesRecv(std::vector<LBMSimulationParameter::EdgeNodePositions> &actual,
+                                 std::vector<std::pair<int, int>> &expected)
+{
+    for (int i = 0; i < (int)expected.size(); i++) {
+        if (actual[i].indexOfProcessNeighborRecv != expected[i].first) {
+            return false;
+        }
+        if (actual[i].indexInRecvBuffer != expected[i].second) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static bool compareEdgeNodesSend(const std::vector<LBMSimulationParameter::EdgeNodePositions> &actual,
+                                 const std::vector<std::pair<int, int>> &expected)
+{
+    if (actual.size() < expected.size()) {
+        return false; // guards against out-of-bounds reads when fewer edge nodes were found than expected
+    }
+    for (int i = 0; i < (int)expected.size(); i++) {
+        if (actual[i].indexOfProcessNeighborSend != expected[i].first) {
+            return false;
+        }
+        if (actual[i].indexInSendBuffer != expected[i].second) {
+            return false;
+        }
+    }
+    return true;
+}
+
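+// Fixture: loads the Parameter from the test config and initializes the host simulation parameters (parH) before each test.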
+class ParameterTest_findEdgeNodes : public testing::Test
+{
+protected:
+    std::shared_ptr<Parameter> para;
+    int level = 0;
+
+    void SetUp() override
+    {
+        para = initParameterClass();
+        para->initLBMSimulationParameter();
+    }
+};
+
+TEST_F(ParameterTest_findEdgeNodes, shouldReturnCorrectVectorForXY)
+{
+    para->parH[level]->recvProcessNeighborX.push_back(ProcessNeighbor27());
+    para->parH[level]->sendProcessNeighborY.push_back(ProcessNeighbor27());
+    para->parH[level]->sendProcessNeighborY.push_back(ProcessNeighbor27());
+
+    int numRecvNeighbor = (int)para->parH[level]->recvProcessNeighborX.size() - 1;
+    int numSendNeighbor = (int)para->parH[level]->sendProcessNeighborY.size() - 1;
+
+    const int sizeRecv                                                     = 6;
+    const int sizeSend                                                     = 10;
+    para->parH[level]->recvProcessNeighborX[numRecvNeighbor].numberOfNodes = sizeRecv;
+    para->parH[level]->sendProcessNeighborY[numSendNeighbor].numberOfNodes = sizeSend;
+
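+    // raw index arrays act as stand-ins for the communication buffers of the process neighbors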
+    int recvNeighbors[sizeRecv]                                    = { 1, 2, 3, 4, 5, 6 };
+    para->parH[level]->recvProcessNeighborX[numRecvNeighbor].index = recvNeighbors;
 
+    int sendNeighbors[sizeSend]                                    = { 20, 1, 21, 22, 6, 23, 5, 24, 25, 26 };
+    para->parH[level]->sendProcessNeighborY[numSendNeighbor].index = sendNeighbors;
 
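+    // nodes 1, 5 and 6 appear in both the x-receive and the y-send buffer and should therefore be found as edge nodes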
+    para->findEdgeNodesCommMultiGPU();
+
+    std::vector<std::pair<int, int>> expectedEdgeNodesXtoYRecv = { std::pair(numRecvNeighbor, 0),
+                                                                   std::pair(numRecvNeighbor, 4),
+                                                                   std::pair(numRecvNeighbor, 5) };
+
+    std::vector<std::pair<int, int>> expectedEdgeNodesXtoYSend = { std::pair(numSendNeighbor, 1),
+                                                                   std::pair(numSendNeighbor, 6),
+                                                                   std::pair(numSendNeighbor, 4) };
+
+    EXPECT_THAT(para->parH[level]->edgeNodesXtoY.size(), testing::Eq(expectedEdgeNodesXtoYRecv.size()));
+    EXPECT_TRUE(compareEdgeNodesRecv(para->parH[level]->edgeNodesXtoY, expectedEdgeNodesXtoYRecv))
+        << "the edgeNodesXtoY for the receive process do not match the expected nodes";
+    EXPECT_TRUE(compareEdgeNodesSend(para->parH[level]->edgeNodesXtoY, expectedEdgeNodesXtoYSend))
+        << "the edgeNodesXtoY for the send process do not match the expected nodes";
+}
+
+TEST_F(ParameterTest_findEdgeNodes, shouldReturnCorrectVectorForXZ)
+{
+    para->parH[level]->recvProcessNeighborX.push_back(ProcessNeighbor27());
+    para->parH[level]->sendProcessNeighborZ.push_back(ProcessNeighbor27());
+    para->parH[level]->sendProcessNeighborZ.push_back(ProcessNeighbor27());
+
+    int numRecvNeighbor = (int)para->parH[level]->recvProcessNeighborX.size() - 1;
+    int numSendNeighbor = (int)para->parH[level]->sendProcessNeighborZ.size() - 1;
+
+    const int sizeRecv                                                     = 10;
+    const int sizeSend                                                     = 6;
+    para->parH[level]->recvProcessNeighborX[numRecvNeighbor].numberOfNodes = sizeRecv;
+    para->parH[level]->sendProcessNeighborZ[numSendNeighbor].numberOfNodes = sizeSend;
+
+    int recvNeighbors[sizeRecv]                                    = { 20, 1, 21, 22, 6, 23, 5, 24, 25, 26 };
+    para->parH[level]->recvProcessNeighborX[numRecvNeighbor].index = recvNeighbors;
+
+    int sendNeighbors[sizeSend]                                    = { 1, 2, 3, 4, 5, 6 };
+    para->parH[level]->sendProcessNeighborZ[numSendNeighbor].index = sendNeighbors;
+
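+    // nodes 1, 5 and 6 appear in both the x-receive and the z-send buffer and should therefore be found as edge nodes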
+    para->findEdgeNodesCommMultiGPU();
+
+    std::vector<std::pair<int, int>> expectedEdgeNodesXtoZRecv = { std::pair(numRecvNeighbor, 1),
+                                                                   std::pair(numRecvNeighbor, 4),
+                                                                   std::pair(numRecvNeighbor, 6) };
+    std::vector<std::pair<int, int>> expectedEdgeNodesXtoZSend = { std::pair(numSendNeighbor, 0),
+                                                                   std::pair(numSendNeighbor, 5),
+                                                                   std::pair(numSendNeighbor, 4) };
+
+    EXPECT_THAT(para->parH[level]->edgeNodesXtoZ.size(), testing::Eq(expectedEdgeNodesXtoZRecv.size()));
+    EXPECT_TRUE(compareEdgeNodesRecv(para->parH[level]->edgeNodesXtoZ, expectedEdgeNodesXtoZRecv))
+        << "the edgeNodesXtoZ for the receive process do not match the expected nodes";
+    EXPECT_TRUE(compareEdgeNodesSend(para->parH[level]->edgeNodesXtoZ, expectedEdgeNodesXtoZSend))
+        << "the edgeNodesXtoZ for the send process do not match the expected nodes";
+}
+
+TEST_F(ParameterTest_findEdgeNodes, shouldReturnCorrectVectorForYZ)
+{
+    para->parH[level]->recvProcessNeighborY.push_back(ProcessNeighbor27());
+    para->parH[level]->sendProcessNeighborZ.push_back(ProcessNeighbor27());
+    para->parH[level]->sendProcessNeighborZ.push_back(ProcessNeighbor27());
+
+    const int sizeRecv  = 10;
+    const int sizeSend1 = 6;
+    const int sizeSend2 = 5;
+
+    para->parH[level]->recvProcessNeighborY[0].numberOfNodes = sizeRecv;
+    para->parH[level]->sendProcessNeighborZ[0].numberOfNodes = sizeSend1;
+    para->parH[level]->sendProcessNeighborZ[1].numberOfNodes = sizeSend2;
+
+    int recvNeighbors[sizeRecv]                      = { 20, 1, 9, 22, 6, 23, 5, 24, 11, 26 };
+    para->parH[level]->recvProcessNeighborY[0].index = recvNeighbors;
+
+    int sendNeighbors1[sizeSend1]                    = { 1, 2, 3, 4, 5, 6 };
+    int sendNeighbors2[sizeSend2]                    = { 7, 8, 9, 10, 11 };
+    para->parH[level]->sendProcessNeighborZ[0].index = sendNeighbors1;
+    para->parH[level]->sendProcessNeighborZ[1].index = sendNeighbors2;
+
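+    // nodes 1, 5 and 6 are shared with the first z-send neighbor, nodes 9 and 11 with the second one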
+    para->findEdgeNodesCommMultiGPU();
+
+    std::vector<std::pair<int, int>> expectedEdgeNodesYtoZRecv = { std::pair(0, 1), std::pair(0, 2), std::pair(0, 4),
+                                                                   std::pair(0, 6), std::pair(0, 8) };
+    std::vector<std::pair<int, int>> expectedEdgeNodesYtoZSend = { std::pair(0, 0), std::pair(1, 2), std::pair(0, 5),
+                                                                   std::pair(0, 4), std::pair(1, 4) };
+
+    EXPECT_THAT(para->parH[level]->edgeNodesYtoZ.size(), testing::Eq(expectedEdgeNodesYtoZRecv.size()));
+    EXPECT_TRUE(compareEdgeNodesRecv(para->parH[level]->edgeNodesYtoZ, expectedEdgeNodesYtoZRecv))
+        << "the edgeNodesYtoZ for the receive process do not match the expected nodes";
+    EXPECT_TRUE(compareEdgeNodesSend(para->parH[level]->edgeNodesYtoZ, expectedEdgeNodesYtoZSend))
+        << "the edgeNodesYtoZ for the send process do not match the expected nodes";
+}
\ No newline at end of file