diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a2e90d98acc29810ef98e96c7903b7363fda4233..2c6c35332b372379891de0c1ecc14640b48cbca8 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -459,7 +459,9 @@ gcov_gcc_9:
       - coverage/
 
     reports:
-      cobertura: coverage/coverage.xml
+      coverage_report: 
+        coverage_format: cobertura
+        path: coverage/coverage.xml
 
   cache:
     key: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
diff --git a/Python/boundary_layer/__init__.py b/Python/boundary_layer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Python/boundary_layer/boundary_layer.py b/Python/boundary_layer/boundary_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf941a9418e5c3ec5d94864f119de20401601622
--- /dev/null
+++ b/Python/boundary_layer/boundary_layer.py
@@ -0,0 +1,108 @@
+#%% Imports
+import numpy as np
+from pathlib import Path
+from mpi4py import MPI
+from pyfluids import basics, gpu, logger
+#%% Physical and numerical parameters
+reference_diameter = 126
+
+length = np.array([30,8,8])*reference_diameter
+viscosity = 1.56e-5
+velocity = 9
+mach = 0.1
+nodes_per_diameter = 32
+
+sim_name = "BoundaryLayer"
+config_file = Path(__file__).parent/Path("config.txt")
+output_path = Path(__file__).parent/Path("output")
+output_path.mkdir(exist_ok=True)
+timeStepOut = 500
+t_end = 50
+
+#%% Logger configuration
+logger.Logger.initialize_logger()
+basics.logger.Logger.add_stdout()
+basics.logger.Logger.set_debug_level(basics.logger.Level.INFO_LOW)
+basics.logger.Logger.time_stamp(basics.logger.TimeStamp.ENABLE)
+basics.logger.Logger.enable_printed_rank_numbers(True)
+#%% Grid: a single coarse level without periodic boundaries
+grid_builder = gpu.MultipleGridBuilder.make_shared()
+dx = reference_diameter/nodes_per_diameter
+
+grid_builder.add_coarse_grid(0.0, 0.0, 0.0, *length, dx)
+grid_builder.set_periodic_boundary_condition(False, False, False)
+grid_builder.build_grids(basics.LbmOrGks.LBM, False)
+# %% Communicator
+comm = gpu.Communicator.get_instance()
+#%% Load the configuration file
+config = basics.ConfigurationFile()
+config.load(str(config_file))
+#%% Parameter object and conversion to lattice (LB) units
+para = gpu.Parameter(config, comm.get_number_of_process(), comm.get_pid())
+
+dt = dx * mach / (np.sqrt(3) * velocity)
+velocity_lb = velocity * dt / dx # LB units
+viscosity_lb = viscosity * dt / (dx * dx) # LB units
+
+#%% Device and output settings
+para.set_devices([0])
+para.set_output_prefix(sim_name)
+para.set_output_path(str(output_path))
+para.set_f_name(para.get_output_path() + "/" + para.get_output_prefix())
+para.set_print_files(True)
+para.set_max_level(1)
+#%% Flow parameters, kernel, initial condition and run length
+para.set_velocity(velocity_lb)
+para.set_viscosity(viscosity_lb)    
+para.set_velocity_ratio(dx/dt)
+para.set_main_kernel("CumulantK17CompChim")
+
+def init_func(coord_x, coord_y, coord_z):
+    return [0.0, velocity_lb, 0.0, 0.0]
+
+para.set_initial_condition(init_func)
+para.set_t_out(timeStepOut)
+para.set_t_end(int(t_end/dt))
+para.set_is_body_force(True)
+
+#%% Velocity boundary conditions on all six sides
+grid_builder.set_velocity_boundary_condition(gpu.SideType.MX, velocity_lb, 0.0, 0.0)
+grid_builder.set_velocity_boundary_condition(gpu.SideType.PX, velocity_lb, 0.0, 0.0)
+
+grid_builder.set_velocity_boundary_condition(gpu.SideType.MY, velocity_lb, 0.0, 0.0)
+grid_builder.set_velocity_boundary_condition(gpu.SideType.PY, velocity_lb, 0.0, 0.0)
+
+grid_builder.set_velocity_boundary_condition(gpu.SideType.MZ, velocity_lb, 0.0, 0.0)
+grid_builder.set_velocity_boundary_condition(gpu.SideType.PZ, velocity_lb, 0.0, 0.0)
+
+#%% Memory manager and grid generator
+cuda_memory_manager = gpu.CudaMemoryManager.make(para)
+grid_generator = gpu.GridProvider.make_grid_generator(grid_builder, para, cuda_memory_manager)
+#%% Actuator line
+turb_pos = np.array([3,3,3])*reference_diameter
+epsilon = 5
+density = 1.225
+level = 0
+n_blades = 3
+n_blade_nodes = 32
+alm = gpu.ActuatorLine(n_blades, density, n_blade_nodes, epsilon, *turb_pos, reference_diameter, level, dt, dx)
+para.add_actuator(alm)
+#%% Probes
+point_probe = gpu.probes.PointProbe("pointProbe", str(output_path), 100, 500, 100)  # NOTE(review): the C++ PointProbe call in ActuatorLine.cpp now passes four numbers (100, 1, 500, 100) - confirm the binding still accepts three
+point_probe.add_probe_points_from_list(np.array([1,2,5])*reference_diameter, np.array([3,3,3])*reference_diameter, np.array([3,3,3])*reference_diameter)
+point_probe.add_post_processing_variable(gpu.probes.PostProcessingVariable.Means)
+
+para.add_probe(point_probe)
+
+plane_probe = gpu.probes.PlaneProbe("planeProbe", str(output_path), 100, 500, 100)
+plane_probe.set_probe_plane(5*reference_diameter, 0, 0, dx, length[1], length[2])
+para.add_probe(plane_probe)
+#%% Set up the simulation
+sim = gpu.Simulation(comm)
+kernel_factory = gpu.KernelFactory.get_instance()
+sim.set_factories(kernel_factory, gpu.PreProcessorFactory.get_instance())
+sim.init(para, grid_generator, gpu.FileWriter(), cuda_memory_manager)
+#%% Run and clean up
+sim.run()
+sim.free()
+MPI.Finalize()
\ No newline at end of file
diff --git a/Python/boundary_layer/config.txt b/Python/boundary_layer/config.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e4c778c4cc048f54c0a32310e6bf4a7343a263fa
--- /dev/null
+++ b/Python/boundary_layer/config.txt
@@ -0,0 +1,2 @@
+Path = .
+GridPath = .
diff --git a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
index 6f22d023925ad68bfb9bd5b14f845813cad71105..6be64950710c53b3c7931180a9beb1368a615fe3 100644
--- a/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
+++ b/apps/gpu/LBM/ActuatorLine/ActuatorLine.cpp
@@ -191,7 +191,7 @@ void multipleLevel(const std::string& configPath)
     SPtr<ActuatorLine> actuator_line =SPtr<ActuatorLine>( new ActuatorLine(nBlades, density, nBladeNodes, epsilon, turbPos[0], turbPos[1], turbPos[2], reference_diameter, level, dt, dx) );
     para->addActuator( actuator_line );
 
-    SPtr<PointProbe> pointProbe = SPtr<PointProbe>( new PointProbe("pointProbe", para->getOutputPath(), 100, 500, 100) );
+    SPtr<PointProbe> pointProbe = SPtr<PointProbe>( new PointProbe("pointProbe", para->getOutputPath(), 100, 1, 500, 100) );
     std::vector<real> probeCoordsX = {reference_diameter,2*reference_diameter,5*reference_diameter};
     std::vector<real> probeCoordsY = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
     std::vector<real> probeCoordsZ = {3*reference_diameter,3*reference_diameter,3*reference_diameter};
diff --git a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c440bd14cf46ca8dae8013b5c0a480109924f7c4
--- /dev/null
+++ b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
@@ -0,0 +1,270 @@
+
+#define _USE_MATH_DEFINES
+#include <math.h>
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <stdexcept>
+#include <fstream>
+#include <exception>
+#include <memory>
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "Core/DataTypes.h"
+#include "PointerDefinitions.h"
+
+#include "Core/StringUtilities/StringUtil.h"
+
+#include "Core/VectorTypes.h"
+
+#include <basics/config/ConfigurationFile.h>
+
+#include <logger/Logger.h>
+
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "GridGenerator/grid/GridBuilder/LevelGridBuilder.h"
+#include "GridGenerator/grid/GridBuilder/MultipleGridBuilder.h"
+#include "GridGenerator/grid/BoundaryConditions/Side.h"
+#include "GridGenerator/grid/GridFactory.h"
+
+#include "GridGenerator/io/SimulationFileWriter/SimulationFileWriter.h"
+#include "GridGenerator/io/GridVTKWriter/GridVTKWriter.h"
+#include "GridGenerator/io/STLReaderWriter/STLReader.h"
+#include "GridGenerator/io/STLReaderWriter/STLWriter.h"
+
+//////////////////////////////////////////////////////////////////////////
+
+#include "VirtualFluids_GPU/LBM/Simulation.h"
+#include "VirtualFluids_GPU/Communication/Communicator.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridProvider.h"
+#include "VirtualFluids_GPU/DataStructureInitializer/GridReaderFiles/GridReader.h"
+#include "VirtualFluids_GPU/Parameter/Parameter.h"
+#include "VirtualFluids_GPU/Output/FileWriter.h"
+#include "VirtualFluids_GPU/PreCollisionInteractor/ActuatorLine.h"
+#include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h"
+#include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h"
+#include "VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h"
+#include "VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h"
+
+#include "VirtualFluids_GPU/Kernel/Utilities/KernelFactory/KernelFactoryImp.h"
+#include "VirtualFluids_GPU/PreProcessor/PreProcessorFactory/PreProcessorFactoryImp.h"
+
+#include "VirtualFluids_GPU/GPU/CudaMemoryManager.h"
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+std::string path(".");
+
+std::string simulationName("BoundaryLayer"); // passed to Parameter::setOutputPrefix in multipleLevel
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+void multipleLevel(const std::string& configPath)
+{
+    // Sets up and runs the boundary-layer case (periodic in x and y, stress BC on MZ, slip BC on PZ) using the settings in the file at configPath.
+    logging::Logger::addStream(&std::cout);
+    logging::Logger::setDebugLevel(logging::Logger::Level::INFO_LOW);
+    logging::Logger::timeStamp(logging::Logger::ENABLE);
+    logging::Logger::enablePrintedRankNumbers(logging::Logger::ENABLE);
+
+    auto gridFactory = GridFactory::make();
+    auto gridBuilder = MultipleGridBuilder::makeShared(gridFactory);
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    vf::gpu::Communicator& communicator = vf::gpu::Communicator::getInstance();
+
+    vf::basics::ConfigurationFile config;
+    config.load(configPath);
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    SPtr<Parameter> para = std::make_shared<Parameter>(config, communicator.getNummberOfProcess(), communicator.getPID());
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    //
+    //          U s e r    s e t t i n g s
+    //
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    LbmOrGks lbmOrGks = LBM;
+
+    const real H = 1000.0; // boundary layer height in m
+
+    const real L_x = 6*H;
+    const real L_y = 4*H;
+    const real L_z = 1*H;
+
+    const real z0  = 0.1; // roughness length in m
+    const real u_star = 0.4; //friction velocity in m/s
+    const real kappa = 0.4; // von Karman constant
+
+    const real viscosity = 1.56e-5;
+
+    const real velocity  = 0.5*u_star/kappa*log(L_z/z0); //0.5 times max mean velocity at the top in m/s
+
+    const real mach = config.contains("Ma")? config.getValue<real>("Ma"): 0.1;
+
+    const uint nodes_per_H = config.contains("nz")? config.getValue<uint>("nz"): 64;
+
+    // all times in s; kept as real (the type they are read as) so that dividing by dt below does not mix precisions
+    const real tStartOut   = config.getValue<real>("tStartOut");
+    const real tOut        = config.getValue<real>("tOut");
+    const real tEnd        = config.getValue<real>("tEnd"); // total time of simulation
+
+    const real tStartAveraging     =  config.getValue<real>("tStartAveraging");
+    const real tStartTmpAveraging  =  config.getValue<real>("tStartTmpAveraging");
+    const real tAveraging          =  config.getValue<real>("tAveraging");
+    const real tStartOutProbe      =  config.getValue<real>("tStartOutProbe");
+    const real tOutProbe           =  config.getValue<real>("tOutProbe");
+
+
+    const real dx = L_z/real(nodes_per_H);
+
+    const real dt = dx * mach / (sqrt(3) * velocity);
+
+    const real velocityLB = velocity * dt / dx; // LB units
+
+    const real viscosityLB = viscosity * dt / (dx * dx); // LB units
+
+    const real pressureGradient = u_star * u_star / H ;
+    const real pressureGradientLB = pressureGradient * (dt*dt)/dx; // LB units
+
+    VF_LOG_INFO("velocity  [dx/dt] = {}", velocityLB);
+    VF_LOG_INFO("dt   = {}", dt);
+    VF_LOG_INFO("dx   = {}", dx);
+    VF_LOG_INFO("viscosity [10^8 dx^2/dt] = {}", viscosityLB*1e8);
+    VF_LOG_INFO("u* /(dx/dt) = {}", u_star*dt/dx);
+    VF_LOG_INFO("dpdx  = {}", pressureGradient);
+    VF_LOG_INFO("dpdx /(dx/dt^2) = {}", pressureGradientLB);
+
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    para->setOutputPrefix( simulationName );
+
+    para->setFName(para->getOutputPath() + "/" + para->getOutputPrefix());
+
+    para->setPrintFiles(true);
+
+    para->setForcing(pressureGradientLB, 0, 0);
+    para->setVelocity(velocityLB);
+    para->setViscosity(viscosityLB);
+    para->setVelocityRatio( dx / dt );
+    para->setViscosityRatio( dx*dx/dt );
+    para->setDensityRatio( 1.0 );
+
+    if(para->getUseAMD())
+        para->setMainKernel("TurbulentViscosityCumulantK17CompChim");
+    else
+        para->setMainKernel("CumulantK17CompChim");
+
+    para->setIsBodyForce( config.getValue<bool>("bodyForce") );
+
+    para->setTStartOut(uint(tStartOut/dt) );
+    para->setTOut( uint(tOut/dt) );
+    para->setTEnd( uint(tEnd/dt) );
+
+    // para->setTOut( 100 );
+    // para->setTEnd( 100000 );
+
+    /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    gridBuilder->addCoarseGrid(0.0, 0.0, 0.0,
+                                L_x,  L_y,  L_z, dx);
+    // gridBuilder->setNumberOfLayers(0,0);
+    // gridBuilder->addGrid( new Cuboid( 300., 300., 300., 1000. , 1000., 600.), 1 );
+
+    gridBuilder->setPeriodicBoundaryCondition(true, true, false);
+
+    gridBuilder->buildGrids(lbmOrGks, false); // buildGrids() has to be called before setting the BCs!!!!
+
+    uint samplingOffset = 2;
+    // gridBuilder->setVelocityBoundaryCondition(SideType::MZ, 0.0, 0.0, 0.0);
+    gridBuilder->setStressBoundaryCondition(SideType::MZ,
+                                            0.0, 0.0, 1.0,              // wall normals
+                                            samplingOffset, z0/dx);     // wall model settings
+    para->setHasWallModelMonitor(true);
+
+
+    // gridBuilder->setVelocityBoundaryCondition(SideType::PZ, 0.0, 0.0, 0.0);
+    gridBuilder->setSlipBoundaryCondition(SideType::PZ,  0.0,  0.0, 0.0);
+
+    const real cPi = 3.1415926535897932384626433832795;
+    para->setInitialCondition([&](real coordX, real coordY, real coordZ, real &rho, real &vx, real &vy, real &vz) {
+        rho = (real)0.0; // NOTE(review): log(coordZ/z0) below is -inf/NaN for coordZ <= 0 - confirm no initialised node lies there
+        vx  = (u_star/kappa * log(coordZ/z0) + 2.0*sin(cPi*16.0f*coordX/L_x)*sin(cPi*8.0f*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1))  * dt / dx; // log law plus perturbation, in LB units
+        vy  =  2.0*sin(cPi*16.0f*coordX/L_x)*sin(cPi*8.0f*coordZ/H)/(pow(coordZ/H,c2o1)+c1o1)  * dt / dx;
+        vz  = 8.0*u_star/0.4*(sin(cPi*8.0*coordY/H)*sin(cPi*8.0*coordZ/H)+sin(cPi*8.0*coordX/L_x))/(pow(L_z/2.0-coordZ, c2o1)+c1o1) * dt / dx;
+    });
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+    SPtr<CudaMemoryManager> cudaMemoryManager = CudaMemoryManager::make(para);
+
+    SPtr<GridProvider> gridGenerator = GridProvider::makeGridGenerator(gridBuilder, para, cudaMemoryManager);
+
+    SPtr<PlanarAverageProbe> planarAverageProbe = SPtr<PlanarAverageProbe>( new PlanarAverageProbe("planeProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt , tStartOutProbe/dt, tOutProbe/dt, 'z') );
+    planarAverageProbe->addAllAvailableStatistics();
+    planarAverageProbe->setFileNameToNOut();
+    para->addProbe( planarAverageProbe );
+
+    para->setHasWallModelMonitor(true);
+    SPtr<WallModelProbe> wallModelProbe = SPtr<WallModelProbe>( new WallModelProbe("wallModelProbe", para->getOutputPath(), tStartAveraging/dt, tStartTmpAveraging/dt, tAveraging/dt/4.0 , tStartOutProbe/dt, tOutProbe/dt) );
+    wallModelProbe->addAllAvailableStatistics();
+    wallModelProbe->setFileNameToNOut();
+    wallModelProbe->setForceOutputToStress(true);
+    if(para->getIsBodyForce())
+        wallModelProbe->setEvaluatePressureGradient(true);
+    para->addProbe( wallModelProbe );
+
+    Simulation sim(communicator);
+    SPtr<FileWriter> fileWriter = SPtr<FileWriter>(new FileWriter());
+    SPtr<KernelFactoryImp> kernelFactory = KernelFactoryImp::getInstance();
+    SPtr<PreProcessorFactoryImp> preProcessorFactory = PreProcessorFactoryImp::getInstance();
+    sim.setFactories(kernelFactory, preProcessorFactory);
+    sim.init(para, gridGenerator, fileWriter, cudaMemoryManager);
+    sim.run();
+    sim.free();
+}
+
+int main( int argc, char* argv[])
+{
+    // argv[1] (optional) is the directory holding configBoundaryLayer.txt; the exit status is 1 if an exception aborted the run.
+    int exitCode = 0;
+    if ( argv != nullptr )
+    {
+        try
+        {
+            vf::logging::Logger::initalizeLogger();
+            if( argc > 1){ path = argv[1]; }
+            multipleLevel(path + "/configBoundaryLayer.txt");
+        }
+        catch (const spdlog::spdlog_ex &ex) {
+            std::cout << "Log initialization failed: " << ex.what() << std::endl;
+            exitCode = 1;
+        }
+        catch (const std::bad_alloc& e) {
+            VF_LOG_CRITICAL("Bad Alloc: {}", e.what());
+            exitCode = 1;
+        }
+        catch (const std::exception& e) {
+            VF_LOG_CRITICAL("exception: {}", e.what());
+            exitCode = 1;
+        }
+        catch (...) {
+            VF_LOG_CRITICAL("Unknown exception!");
+            exitCode = 1;
+        }
+    }
+    return exitCode;
+}
diff --git a/apps/gpu/LBM/BoundaryLayer/CMakeLists.txt b/apps/gpu/LBM/BoundaryLayer/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..801b634803943d48abda690935df0867eb3418d2
--- /dev/null
+++ b/apps/gpu/LBM/BoundaryLayer/CMakeLists.txt
@@ -0,0 +1,7 @@
+PROJECT(BoundaryLayer LANGUAGES CUDA CXX)
+
+vf_add_library(BUILDTYPE binary PRIVATE_LINK basics VirtualFluids_GPU GridGenerator MPI::MPI_CXX FILES BoundaryLayer.cpp)
+
+set_source_files_properties(BoundaryLayer.cpp PROPERTIES LANGUAGE CUDA)
+
+set_target_properties(BoundaryLayer PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
diff --git a/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt b/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a489f0ab89738a193b16fee41c212a5943f6525d
--- /dev/null
+++ b/apps/gpu/LBM/BoundaryLayer/configBoundaryLayer.txt
@@ -0,0 +1,30 @@
+##################################################
+#information for writing
+##################################################
+Path = .
+##################################################
+#information for reading
+##################################################
+GridPath = .
+##################################################
+Devices = 1 
+##################################################
+tStartOut           = 0
+tOut                = 100000
+tEnd                = 300000
+##################################################
+tStartAveraging     = 0
+tStartTmpAveraging  = 100000
+tAveraging          = 200
+tStartOutProbe      = 0
+tOutProbe           = 1000 
+##################################################
+Ma = 0.1
+nz = 96 
+
+bodyForce = true
+UseAMD = true
+SGSconstant = 0.2
+QuadricLimiterP = 100000.0
+QuadricLimiterM = 100000.0
+QuadricLimiterD = 100000.0
diff --git a/gpu.cmake b/gpu.cmake
index 44c3ce9ab3eb8d99ed8ede0ddc58bfe4112b78dd..4a1b1a9eb070dcb85ff0c4147fa3b272372a2da9 100644
--- a/gpu.cmake
+++ b/gpu.cmake
@@ -37,6 +37,7 @@ IF (BUILD_VF_GPU)
     #add_subdirectory(apps/gpu/LBM/TGV_3D)
     #add_subdirectory(apps/gpu/LBM/TGV_3D_MultiGPU)
     #add_subdirectory(apps/gpu/LBM/ActuatorLine)
+    add_subdirectory(apps/gpu/LBM/BoundaryLayer)
 ELSE()
     MESSAGE( STATUS "exclude Virtual Fluids GPU." )
 ENDIF()
@@ -130,4 +131,4 @@ endif()
 if(BUILD_VF_TRAFFIC)
     add_subdirectory(src/gpu/Traffic)
     add_subdirectory(apps/gpu/LBM/TrafficTest)
-endif()
\ No newline at end of file
+endif()
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
index 8930bdf3b165b4e0dbb497773fd0b6cf6ec6f8f7..5102f60fc295aadf4323a4b332bf3dd8f7f21dbf 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.cpp
@@ -42,6 +42,8 @@ bool gg::BoundaryCondition::isSide( SideType side ) const
     return this->side->whoAmI() == side;
 }
 
+//////////////////////////////////////////////////////////////////////////
+
 void VelocityBoundaryCondition::setVelocityProfile(
     SPtr<Grid> grid, std::function<void(real, real, real, real &, real &, real &)> velocityProfile)
 {
@@ -55,6 +57,8 @@ void VelocityBoundaryCondition::setVelocityProfile(
     }
 }
 
+//////////////////////////////////////////////////////////////////////////
+
 void GeometryBoundaryCondition::setTangentialVelocityForPatch(SPtr<Grid> grid, uint patch, 
                                                               real p1x, real p1y, real p1z, 
                                                               real p2x, real p2y, real p2z, 
@@ -102,3 +106,23 @@ void GeometryBoundaryCondition::setTangentialVelocityForPatch(SPtr<Grid> grid, u
         }
     }
 }
+
+//////////////////////////////////////////////////////////////////////////
+
+void StressBoundaryCondition::fillSamplingIndices(std::vector<SPtr<Grid> > grid, uint level, uint samplingOffset)
+{
+    // For each boundary node, append the index of the node located samplingOffset cells along that node's stored normal.
+    const SPtr<Grid> &levelGrid = grid[level];
+    for( uint node = 0; node < this->indices.size(); node++ )
+    {
+        real xNode, yNode, zNode;
+        levelGrid->transIndexToCoords(this->indices[node], xNode, yNode, zNode);
+
+        const real xSample = xNode + this->getNormalx(node)*samplingOffset*levelGrid->getDelta();
+        const real ySample = yNode + this->getNormaly(node)*samplingOffset*levelGrid->getDelta();
+        const real zSample = zNode + this->getNormalz(node)*samplingOffset*levelGrid->getDelta();
+
+        this->velocitySamplingIndices.push_back( levelGrid->transCoordToIndex(xSample, ySample, zSample) );
+    }
+}
+
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
index 9ae5f09e208e92213ca90ff75f095eddd5dbeaf1..8ea4c7ea6e37be1fd5ef8dbd1685f55b1ad549e0 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/BoundaryCondition.h
@@ -118,6 +118,52 @@ public:
     }
 
     void fillSlipNormalLists()
+    {   
+        for (uint index : this->indices) {
+            (void)index;
+            this->normalXList.push_back(normalX);
+            this->normalYList.push_back(normalY);
+            this->normalZList.push_back(normalZ);
+        }
+    }
+
+    real getNormalx() { return this->normalX; }
+    real getNormaly() { return this->normalY; }
+    real getNormalz() { return this->normalZ; }
+
+    real getNormalx(uint index) { return this->normalXList[index]; }
+    real getNormaly(uint index) { return this->normalYList[index]; }
+    real getNormalz(uint index) { return this->normalZList[index]; }
+};
+
+//////////////////////////////////////////////////////////////////////////
+
+class StressBoundaryCondition : public gg::BoundaryCondition
+{
+public:
+    static SPtr<StressBoundaryCondition> make(real normalX, real normalY, real normalZ, uint samplingOffset, real z0)
+    {
+        return SPtr<StressBoundaryCondition>(new StressBoundaryCondition(normalX, normalY, normalZ, samplingOffset, z0));
+    }
+
+    real normalX, normalY, normalZ;
+    uint samplingOffset;
+    real z0;
+    std::vector<real> normalXList, normalYList, normalZList;
+    std::vector<uint> samplingOffsetList;
+    std::vector<real> z0List;
+    std::vector<uint> velocitySamplingIndices;
+
+protected:
+    StressBoundaryCondition(real normalX, real normalY, real normalZ, uint samplingOffset, real z0) :   normalX(normalX), normalY(normalY), normalZ(normalZ), samplingOffset(samplingOffset), z0(z0){ }
+
+public:
+    virtual char getType() const override
+    {
+        return vf::gpu::BC_STRESS;
+    }
+    
+    void fillStressNormalLists()
     {
         for (uint index : this->indices) {
             (void)index;
@@ -127,6 +173,22 @@ public:
         }
     }
 
+    void fillZ0Lists()
+    {
+        for (uint index : this->indices) {
+            (void)index;
+            this->z0List.push_back(z0);
+        }
+    }
+
+    void fillSamplingOffsetLists()
+    {
+        for (uint index : this->indices) {
+            (void)index;
+            this->samplingOffsetList.push_back(samplingOffset);
+        }
+    }
+
     real getNormalx() { return this->normalX; }
     real getNormaly() { return this->normalY; }
     real getNormalz() { return this->normalZ; }
@@ -134,6 +196,15 @@ public:
     real getNormalx(uint index) { return this->normalXList[index]; }
     real getNormaly(uint index) { return this->normalYList[index]; }
     real getNormalz(uint index) { return this->normalZList[index]; }
+
+    uint getSamplingOffset() { return this->samplingOffset; }
+    uint getSamplingOffset(uint index) { return this->samplingOffsetList[index]; }
+
+    real getZ0() { return this->z0; }
+    real getZ0(uint index) { return this->z0List[index]; }
+
+    void fillSamplingIndices(std::vector<SPtr<Grid> > grid, uint level, uint samplingOffset);
+
 };
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
index f76844c134cdc4117d010f8f7f667640d38cc2e2..6c7bf8ca1853826d83fb6a713ffe03716bd2cf9a 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
@@ -53,16 +53,18 @@ void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition
                                             || grid->getFieldEntry(index) == vf::gpu::FLUID_CFC
                                             || grid->getFieldEntry(index) == vf::gpu::FLUID_CFF
                                             || grid->getFieldEntry(index) == vf::gpu::FLUID_FCC
-                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_FCF ) )
+                                            || grid->getFieldEntry(index) == vf::gpu::FLUID_FCF ))
             {
                 grid->setFieldEntry(index, boundaryCondition->getType());
                 boundaryCondition->indices.push_back(index);
                 setPressureNeighborIndices(boundaryCondition, grid, index);
+                setStressSamplingIndices(boundaryCondition, grid, index);
 
                 setQs(grid, boundaryCondition, index);
 
                 boundaryCondition->patches.push_back(0);
             }
+
         }
     }
 }
@@ -91,6 +93,30 @@ void Side::setPressureNeighborIndices(SPtr<BoundaryCondition> boundaryCondition,
     }
 }
 
+void Side::setStressSamplingIndices(SPtr<BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index)
+{
+    // Only stress boundary conditions store sampling indices; for any other type the cast yields null and nothing happens.
+    auto stressBoundaryCondition = std::dynamic_pointer_cast<StressBoundaryCondition>(boundaryCondition);
+    if (stressBoundaryCondition)
+    {
+        real x, y, z;
+        grid->transIndexToCoords(index, x, y, z);
+
+        // Multiply in real arithmetic: if getDirection() is a signed int (declaration not shown here), multiplying it
+        // by the unsigned samplingOffset would wrap around for sides where -getDirection() is negative.
+        const real shift = -real(boundaryCondition->side->getDirection()) * real(stressBoundaryCondition->samplingOffset) * grid->getDelta();
+
+        // Shift the sampling point along the side's own coordinate axis only.
+        const auto axis = boundaryCondition->side->getCoordinate();
+        const real nx = (axis == X_INDEX) ? shift + x : x;
+        const real ny = (axis == Y_INDEX) ? shift + y : y;
+        const real nz = (axis == Z_INDEX) ? shift + z : z;
+
+        uint samplingIndex = grid->transCoordToIndex(nx, ny, nz);
+        stressBoundaryCondition->velocitySamplingIndices.push_back(samplingIndex);
+    }
+}
+
 void Side::setQs(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, uint index)
 {
 
@@ -133,6 +159,7 @@ void Side::setQs(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition, uin
             qNode[dir] = 0.5;
         else
             qNode[dir] = -1.0;
+
     }
 
     boundaryCondition->qs.push_back(qNode);
@@ -280,6 +307,6 @@ void PZ::addIndices(std::vector<SPtr<Grid> > grid, uint level, SPtr<BoundaryCond
     real coordinateNormal = grid[level]->getEndZ() - grid[level]->getDelta();
 
     if( coordinateNormal < grid[0]->getEndZ() - grid[0]->getDelta() ) return;
-
+    
     Side::addIndices(grid[level], boundaryCondition, "z", coordinateNormal, startInner, endInner, startOuter, endOuter);
 }
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
index d4c9e3a4bcab73d368c863ee57d66f692126fa06..c9ffd40b0aa8fc2b8da8b4d85de60faea6927117 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.h
@@ -78,6 +78,8 @@ protected:
 
     static void setPressureNeighborIndices(SPtr<gg::BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index);
 
+    static void setStressSamplingIndices(SPtr<gg::BoundaryCondition> boundaryCondition, SPtr<Grid> grid, const uint index);
+
     static void setQs(SPtr<Grid> grid, SPtr<gg::BoundaryCondition> boundaryCondition, uint index);
 
 private:
diff --git a/src/gpu/GridGenerator/grid/Field.cpp b/src/gpu/GridGenerator/grid/Field.cpp
index d8ac2a80ea6fc5da879c5378aac2eab70016ff72..86985af60e1ca25c247b586dbc2f356c665a8875 100644
--- a/src/gpu/GridGenerator/grid/Field.cpp
+++ b/src/gpu/GridGenerator/grid/Field.cpp
@@ -130,7 +130,7 @@ bool Field::isQ(uint index) const
 
 bool Field::isBoundaryConditionNode(uint index) const
 {
-    return  field[index] == BC_SOLID || field[index] == BC_OUTFLOW || field[index] == BC_VELOCITY || field[index] == BC_PRESSURE || field[index] == BC_SLIP;
+    return  field[index] == BC_SOLID || field[index] == BC_OUTFLOW || field[index] == BC_VELOCITY || field[index] == BC_PRESSURE || field[index] == BC_SLIP || field[index] == BC_STRESS;
 }
 
 // --------------------------------------------------------- //
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
index 6ab8efc88d02e1032c2d26c756e84d4fa33359ac..a5ee3943f23ed4e9ffa1acb92ffc525e9de7780c 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/GridBuilder.h
@@ -98,6 +98,13 @@ public:
     virtual void getSlipValues(real *normalX, real *normalY, real *normalZ, int *indices, int level) const = 0;
     virtual void getSlipQs(real* qs[27], int level) const = 0;
 
+    //! \brief Stress (wall model) BC accessors: node count per level, per-node values and q values copied into caller-provided arrays.
+    virtual uint getStressSize(int level) const = 0;
+    virtual void getStressValues(real *normalX, real *normalY, real *normalZ, real *vx, real *vy, real *vz,
+                                 real *vx1, real *vy1, real *vz1, int *indices, int *samplingIndices,
+                                 int *samplingOffsets, real *z0, int level) const = 0;
+    virtual void getStressQs(real* qs[27], int level) const = 0;
+
     virtual uint getVelocitySize(int level) const = 0;
     virtual void getVelocityValues(real* vx, real* vy, real* vz, int* indices, int level) const = 0;
     virtual void getVelocityQs(real* qs[27], int level) const = 0;
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
index 0b67fe275492cebe8bb519052c51ed0157167194..30156a7c65ffff00fec92ec1d8a7644236756488 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.cpp
@@ -28,7 +28,7 @@
 //
 //! \file LevelGridBuilder.cpp
 //! \ingroup grid
-//! \author Soeren Peters, Stephan Lenz, Martin Schönherr
+//! \author Soeren Peters, Stephan Lenz, Martin Schönherr
 //=======================================================================================
 #include "LevelGridBuilder.h"
 
@@ -84,12 +84,32 @@ void LevelGridBuilder::setSlipBoundaryCondition(SideType sideType, real nomalX,
     slipBoundaryCondition->side->addIndices(grids, 0, slipBoundaryCondition);
 
     slipBoundaryCondition->fillSlipNormalLists();
-
     boundaryConditions[0]->slipBoundaryConditions.push_back(slipBoundaryCondition);
 
     *logging::out << logging::Logger::INFO_INTERMEDIATE << "Set Slip BC on level " << 0 << " with " << (int)slipBoundaryCondition->indices.size() << "\n";
 }
 
+//! \brief Creates a stress boundary condition for one side of the level-0 grids and registers it with the builder.
+void LevelGridBuilder::setStressBoundaryCondition(SideType sideType, real nomalX, real normalY, real normalZ,
+                                                  uint samplingOffset, real z0)
+{
+    SPtr<StressBoundaryCondition> stressBC = StressBoundaryCondition::make(nomalX, normalY, normalZ, samplingOffset, z0);
+
+    // attach the requested side and let it add its indices for level 0
+    stressBC->side = SideFactory::make(sideType);
+    stressBC->side->addIndices(grids, 0, stressBC);
+
+    // presumably these fill the lists that getStressValues() reads back per node (normals, sampling offset, z0)
+    stressBC->fillStressNormalLists();
+    stressBC->fillSamplingOffsetLists();
+    stressBC->fillZ0Lists();
+    // stressBC->fillSamplingIndices(grids, 0, samplingOffset); //redundant with Side::setStressSamplingIndices but potentially a better approach for cases with complex geometries
+
+    boundaryConditions[0]->stressBoundaryConditions.push_back(stressBC);
+
+    *logging::out << logging::Logger::INFO_INTERMEDIATE << "Set Stress BC on level " << 0 << " with " << (int)stressBC->indices.size() << "\n";
+}
+
 void LevelGridBuilder::setVelocityBoundaryCondition(SideType sideType, real vx, real vy, real vz)
 {
     if (sideType == SideType::GEOMETRY)
@@ -167,7 +187,9 @@ void LevelGridBuilder::setNoSlipBoundaryCondition(SideType sideType)
         noSlipBoundaryCondition->side = side;
         noSlipBoundaryCondition->side->addIndices(grids, level, noSlipBoundaryCondition);
 
-        boundaryConditions[level]->noSlipBoundaryConditions.push_back(noSlipBoundaryCondition);
+        noSlipBoundaryCondition->fillVelocityLists();
+
+        boundaryConditions[level]->velocityBoundaryConditions.push_back(noSlipBoundaryCondition); //now effectively just a wrapper for velocityBC with zero velocity. No distinction in Gridgenerator.
     }
 }
 
@@ -341,7 +363,7 @@ void LevelGridBuilder::getSlipValues(real* normalX, real* normalY, real* normalZ
         for (uint index = 0; index < boundaryCondition->indices.size(); index++)
         {
             indices[allIndicesCounter] = grids[level]->getSparseIndex(boundaryCondition->indices[index]) + 1;
-
+            
             normalX[allIndicesCounter] = boundaryCondition->getNormalx(index);
             normalY[allIndicesCounter] = boundaryCondition->getNormaly(index);
             normalZ[allIndicesCounter] = boundaryCondition->getNormalz(index);
@@ -366,6 +388,57 @@ void LevelGridBuilder::getSlipQs(real* qs[27], int level) const
     }
 }
 
+//! \brief Returns the number of stress-BC node indices on \p level, summed over all stress boundary conditions.
+uint LevelGridBuilder::getStressSize(int level) const
+{
+    uint numberOfStressNodes = 0;
+    // each stress boundary condition contributes one entry per stored node index
+    for (const auto &stressBC : boundaryConditions[level]->stressBoundaryConditions)
+        numberOfStressNodes += static_cast<uint>(stressBC->indices.size());
+    return numberOfStressNodes;
+}
+
+//! \brief Copies the per-node stress-BC data of \p level into the caller-provided arrays, one entry per stress node.
+//! NOTE(review): vx, vy, vz and vx1, vy1, vz1 are accepted but never written here - confirm they are initialized elsewhere.
+void LevelGridBuilder::getStressValues( real* normalX, real* normalY, real* normalZ,
+                                        real* vx,      real* vy,      real* vz,
+                                        real* vx1,     real* vy1,     real* vz1,
+                                        int* indices, int* samplingIndices, int* samplingOffset, real* z0, int level) const
+{
+    int node = 0; // running output position across all stress boundary conditions
+    for (const auto &stressBC : boundaryConditions[level]->stressBoundaryConditions)
+    {
+        for (uint i = 0; i < stressBC->indices.size(); i++, node++)
+        {
+            // both index arrays store the sparse index shifted by one
+            indices[node]         = grids[level]->getSparseIndex(stressBC->indices[i]) + 1;
+            samplingIndices[node] = grids[level]->getSparseIndex(stressBC->velocitySamplingIndices[i]) + 1;
+
+            normalX[node] = stressBC->getNormalx(i);
+            normalY[node] = stressBC->getNormaly(i);
+            normalZ[node] = stressBC->getNormalz(i);
+            samplingOffset[node] = stressBC->getSamplingOffset(i);
+            z0[node]             = stressBC->getZ0(i);
+        }
+    }
+}
+
+//! \brief Copies the q values of all stress-BC nodes of \p level into \p qs, addressed as qs[direction][node].
+//! Nodes are written consecutively over all stress boundary conditions of the level.
+void LevelGridBuilder::getStressQs(real* qs[27], int level) const
+{
+    int node = 0;
+    for (const auto &stressBC : boundaryConditions[level]->stressBoundaryConditions)
+    {
+        for (uint i = 0; i < stressBC->indices.size(); i++, node++)
+        {
+            // directions 0 .. getEndDirection(), inclusive
+            for (int dir = 0; dir <= grids[level]->getEndDirection(); dir++)
+                qs[dir][node] = stressBC->qs[i][dir];
+        }
+    }
+}
+
 uint LevelGridBuilder::getVelocitySize(int level) const
 {
     uint size = 0;
diff --git a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
index f2325435d99140f33eee9844c13908de87788558..f3d21cf130aaaf5caac78c8828f35951ebd4e510 100644
--- a/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
+++ b/src/gpu/GridGenerator/grid/GridBuilder/LevelGridBuilder.h
@@ -28,7 +28,7 @@
 //
 //! \file LevelGridBuilder.h
 //! \ingroup grid
-//! \author Soeren Peters, Stephan Lenz, Martin Schönherr
+//! \author Soeren Peters, Stephan Lenz, Martin Schönherr
 //=======================================================================================
 #ifndef LEVEL_GRID_BUILDER_H
 #define LEVEL_GRID_BUILDER_H
@@ -54,6 +54,7 @@ class BoundingBox;
 class Side;
 class VelocityBoundaryCondition;
 class SlipBoundaryCondition;
+class StressBoundaryCondition;
 class PressureBoundaryCondition;
 class GeometryBoundaryCondition;
 enum class SideType;
@@ -73,6 +74,7 @@ public:
     GRIDGENERATOR_EXPORT virtual ~LevelGridBuilder();
 
     GRIDGENERATOR_EXPORT void setSlipBoundaryCondition(SideType sideType, real nomalX, real normalY, real normalZ);
+    GRIDGENERATOR_EXPORT void setStressBoundaryCondition(SideType sideType, real nomalX, real normalY, real normalZ, uint samplingOffset, real z0);
     GRIDGENERATOR_EXPORT void setVelocityBoundaryCondition(SideType sideType, real vx, real vy, real vz);
     GRIDGENERATOR_EXPORT void setPressureBoundaryCondition(SideType sideType, real rho);
     GRIDGENERATOR_EXPORT void setPeriodicBoundaryCondition(bool periodic_X, bool periodic_Y, bool periodic_Z);
@@ -99,6 +101,13 @@ public:
     GRIDGENERATOR_EXPORT virtual void getSlipValues(real* normalX, real* normalY, real* normalZ, int* indices, int level) const override;
     GRIDGENERATOR_EXPORT virtual void getSlipQs(real* qs[27], int level) const override;
 
+    GRIDGENERATOR_EXPORT uint getStressSize(int level) const override;
+    GRIDGENERATOR_EXPORT virtual void getStressValues(  real* normalX, real* normalY, real* normalZ, 
+                                                        real* vx,      real* vy,      real* vz, 
+                                                        real* vx1,     real* vy1,     real* vz1, 
+                                                        int* indices, int* samplingIndices, int* samplingOffsets, real* z0, int level) const override;
+    GRIDGENERATOR_EXPORT virtual void getStressQs(real* qs[27], int level) const override;
+        
     GRIDGENERATOR_EXPORT uint getVelocitySize(int level) const override;
     GRIDGENERATOR_EXPORT virtual void getVelocityValues(real* vx, real* vy, real* vz, int* indices, int level) const override;
     GRIDGENERATOR_EXPORT virtual void getVelocityQs(real* qs[27], int level) const override;
@@ -127,11 +136,13 @@ protected:
 
         std::vector<SPtr<SlipBoundaryCondition>> slipBoundaryConditions;
 
+        std::vector<SPtr<StressBoundaryCondition>> stressBoundaryConditions;
+
         std::vector<SPtr<VelocityBoundaryCondition>> velocityBoundaryConditions;
 
         std::vector<SPtr<PressureBoundaryCondition>> pressureBoundaryConditions;
 
-        std::vector<SPtr<VelocityBoundaryCondition> > noSlipBoundaryConditions;
+        std::vector<SPtr<VelocityBoundaryCondition>> noSlipBoundaryConditions;
 
         SPtr<GeometryBoundaryCondition> geometryBoundaryCondition;
     };
diff --git a/src/gpu/GridGenerator/grid/GridImp.cpp b/src/gpu/GridGenerator/grid/GridImp.cpp
index 56a0cc6870a59de9116c3ac3837db7e08f4308b5..7eda4f9b8e5a374347b8572f3a28a947be5ad9cb 100644
--- a/src/gpu/GridGenerator/grid/GridImp.cpp
+++ b/src/gpu/GridGenerator/grid/GridImp.cpp
@@ -186,7 +186,7 @@ void GridImp::inital(const SPtr<Grid> fineGrid, uint numberOfLayers)
 #pragma omp parallel for
     for (int index = 0; index < (int)this->size; index++)
         this->findEndOfGridStopperNode(index);
-
+    
     *logging::out << logging::Logger::INFO_INTERMEDIATE
         << "Grid created: " << "from (" << this->startX << ", " << this->startY << ", " << this->startZ << ") to (" << this->endX << ", " << this->endY << ", " << this->endZ << ")\n"
         << "nodes: " << this->nx << " x " << this->ny << " x " << this->nz << " = " << this->size << "\n";
@@ -440,7 +440,7 @@ void GridImp::findEndOfGridStopperNode(uint index)
         else
             this->field.setFieldEntryToStopperOutOfGridBoundary(index);
     }
-
+    
 	if (isValidEndOfGridBoundaryStopper(index))
 		this->field.setFieldEntryToStopperOutOfGridBoundary(index);
 }
@@ -1459,7 +1459,6 @@ void GridImp::calculateQs(const uint index, const Vertex &point, Object* object)
                     
                 this->qPatches[ this->qIndices[index] ] = 0;
 
-				//printf("%d %f \n", this->qIndices[index], subdistance);
 			}
 		}
 	}
diff --git a/src/gpu/GridGenerator/grid/GridImp.h b/src/gpu/GridGenerator/grid/GridImp.h
index 77a3cb0014c6bb9c4d69fe4b62e2fb9646539c89..b096f5ff85dcd725ff065dbb6fc31d75c016c869 100644
--- a/src/gpu/GridGenerator/grid/GridImp.h
+++ b/src/gpu/GridGenerator/grid/GridImp.h
@@ -28,7 +28,7 @@
 //
 //! \file GridImp.h
 //! \ingroup grid
-//! \author Soeren Peters, Stephan Lenz, Martin Schönherr
+//! \author Soeren Peters, Stephan Lenz, Martin Schönherr
 //=======================================================================================
 #ifndef GRID_IMP_H
 #define GRID_IMP_H
@@ -197,6 +197,7 @@ public:
     void fixRefinementIntoWall(uint xIndex, uint yIndex, uint zIndex, int dir);
     void findStopperNode(uint index);
     void findEndOfGridStopperNode(uint index);
+    void findEndOfGridStopperPeriodicNode(uint index);
     void findSolidStopperNode(uint index);
     void findBoundarySolidNode(uint index);
 
@@ -209,7 +210,7 @@ public:
     bool isNode(uint index, char type) const;
     bool nodeInNextCellIs(int index, char type) const;
     bool hasAllNeighbors(uint index) const;
-    bool hasNeighborOfType(uint index, char type)const;
+    bool hasNeighborOfType(uint index, char type) const;
     bool cellContainsOnly(Cell &cell, char type) const;
     bool cellContainsOnly(Cell &cell, char typeA, char typeB) const;
 
diff --git a/src/gpu/GridGenerator/grid/NodeValues.h b/src/gpu/GridGenerator/grid/NodeValues.h
index c726fdf85c8199633e118d8f8a5365ee658d4e6a..b8312b0673337d11b4bdf0b8052e89d92ce127ef 100644
--- a/src/gpu/GridGenerator/grid/NodeValues.h
+++ b/src/gpu/GridGenerator/grid/NodeValues.h
@@ -56,6 +56,7 @@ static constexpr char BC_SOLID    = 22;
 static constexpr char BC_SLIP    = 23;
 static constexpr char BC_NOSLIP  = 24;
 static constexpr char BC_OUTFLOW = 25;
+static constexpr char BC_STRESS   = 26;
 
 static constexpr char STOPPER_OUT_OF_GRID          = 30;
 static constexpr char STOPPER_COARSE_UNDER_FINE    = 31;
diff --git a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
index 1fec35ed1cf86b09c04bf861a3386cab3b35410d..17d01e57e4c34894e0e0551dd7443dfe92582240 100644
--- a/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
+++ b/src/gpu/VirtualFluids_GPU/Calculation/UpdateGrid27.cpp
@@ -18,7 +18,7 @@ void updateGrid27(Parameter* para,
                   std::vector < SPtr< Kernel>>& kernels)
 {
     //////////////////////////////////////////////////////////////////////////
-
+    
     if( level != para->getFine() )
     {
         updateGrid27(para, comm, cudaManager, pm, level+1, t, kernels);
@@ -26,35 +26,35 @@ void updateGrid27(Parameter* para,
     }
 
     //////////////////////////////////////////////////////////////////////////
-
+    
     collision(para, pm, level, t, kernels);
-
+    
     //////////////////////////////////////////////////////////////////////////
-
+    
     exchangeMultiGPU(para, comm, cudaManager, level);
-
+    
     //////////////////////////////////////////////////////////////////////////
-
+    
     postCollisionBC(para, level, t);
-
+    
     //////////////////////////////////////////////////////////////////////////
 
     swapBetweenEvenAndOddTimestep(para, level);
 
 	//////////////////////////////////////////////////////////////////////////
-
-	if (para->getUseWale())
+    
+    if (para->getUseWale())
 		calcMacroscopicQuantities(para, level);
 
     if (para->getUseTurbulentViscosity())
         calcTurbulentViscosity(para, level);
-
-	//////////////////////////////////////////////////////////////////////////
-
+    
+    //////////////////////////////////////////////////////////////////////////
+    
     preCollisionBC(para, cudaManager, level, t);
-
+    
     //////////////////////////////////////////////////////////////////////////
-
+    
     if( level != para->getFine() )
     {
         fineToCoarse(para, level);
@@ -63,10 +63,11 @@ void updateGrid27(Parameter* para,
 
         coarseToFine(para, level);
     }
-
+    
     interactWithActuators(para, cudaManager, level, t);
-
+    
     interactWithProbes(para, cudaManager, level, t);
+    //////////////////////////////////////////////////////////////////////////
 }
 
 void collision(Parameter* para, std::vector<std::shared_ptr<PorousMedia>>& pm, int level, unsigned int t, std::vector < SPtr< Kernel>>& kernels)
@@ -274,7 +275,6 @@ void postCollisionBC(Parameter* para, int level, unsigned int t)
     //////////////////////////////////////////////////////////////////////////
     // S L I P
     //////////////////////////////////////////////////////////////////////////
-
     if (para->getParD(level)->kSlipQ > 0)
     {
         //QSlipDev27( para->getParD(level)->numberofthreads, para->getParD(level)->d0SP.f[0],    para->getParD(level)->QSlip.k,
@@ -286,10 +286,46 @@ void postCollisionBC(Parameter* para, int level, unsigned int t)
         QSlipDevComp27( para->getParD(level)->numberofthreads, para->getParD(level)->d0SP.f[0],    para->getParD(level)->QSlip.k,
                         para->getParD(level)->QSlip.q27[0],    para->getParD(level)->kSlipQ,       para->getParD(level)->omega,
                         para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP,
+                        para->getParD(level)->turbViscosity,   para->getUseTurbulentViscosity(),
                         para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
         getLastCudaError("QSlipDev27 execution failed");
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    // S T R E S S (wall model)
+    //////////////////////////////////////////////////////////////////////////
+    if (para->getParD(level)->kStressQ > 0)
+    {
+        // QStressDevComp27( para->getParD(level)->numberofthreads, para->getParD(level)->d0SP.f[0],
+        //                 para->getParD(level)->QStress.k,       para->getParD(level)->QStress.kN,
+        //                 para->getParD(level)->QStress.q27[0],  para->getParD(level)->kStressQ,
+        //                 para->getParD(level)->omega,           para->getParD(level)->turbViscosity,
+        //                 para->getParD(level)->vx_SP,           para->getParD(level)->vy_SP,             para->getParD(level)->vz_SP,
+        //                 para->getParD(level)->QStress.normalX, para->getParD(level)->QStress.normalY,   para->getParD(level)->QStress.normalZ,
+        //                 para->getParD(level)->QStress.Vx,      para->getParD(level)->QStress.Vy,        para->getParD(level)->QStress.Vz,
+        //                 para->getParD(level)->QStress.Vx1,     para->getParD(level)->QStress.Vy1,       para->getParD(level)->QStress.Vz1,
+        //                 para->getParD(level)->wallModel.samplingOffset, para->getParD(level)->wallModel.z0,
+        //                 para->getHasWallModelMonitor(),        para->getParD(level)->wallModel.u_star,
+        //                 para->getParD(level)->wallModel.Fx,    para->getParD(level)->wallModel.Fy,      para->getParD(level)->wallModel.Fz,
+        //                 para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP,      para->getParD(level)->neighborZ_SP,
+        //                 para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
+        // getLastCudaError("QStressDevComp27 execution failed");
+        // Active variant. The velocity triple must be vx_SP, vy_SP, vz_SP (the z slot previously repeated vy_SP).
+        BBStressDev27( para->getParD(level)->numberofthreads, para->getParD(level)->d0SP.f[0],
+                        para->getParD(level)->QStress.k,       para->getParD(level)->QStress.kN,
+                        para->getParD(level)->QStress.q27[0],  para->getParD(level)->kStressQ,
+                        para->getParD(level)->vx_SP,           para->getParD(level)->vy_SP,             para->getParD(level)->vz_SP,
+                        para->getParD(level)->QStress.normalX, para->getParD(level)->QStress.normalY,   para->getParD(level)->QStress.normalZ,
+                        para->getParD(level)->QStress.Vx,      para->getParD(level)->QStress.Vy,        para->getParD(level)->QStress.Vz,
+                        para->getParD(level)->QStress.Vx1,     para->getParD(level)->QStress.Vy1,       para->getParD(level)->QStress.Vz1,
+                        para->getParD(level)->wallModel.samplingOffset, para->getParD(level)->wallModel.z0,
+                        para->getHasWallModelMonitor(),        para->getParD(level)->wallModel.u_star,
+                        para->getParD(level)->wallModel.Fx,    para->getParD(level)->wallModel.Fy,      para->getParD(level)->wallModel.Fz,
+                        para->getParD(level)->neighborX_SP,    para->getParD(level)->neighborY_SP,      para->getParD(level)->neighborZ_SP,
+                        para->getParD(level)->size_Mat_SP,     para->getParD(level)->evenOrOdd);
+        getLastCudaError("BBStressDevice27 execution failed");
+    }
+
     //////////////////////////////////////////////////////////////////////////
     // G E O M E T R Y
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
index f165666f56617bb9d47a9c37a2fe8f5629511b35..9f2bfa4d2ac004237d7a7e62d04496089b05db61 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
@@ -112,6 +112,53 @@ void GridGenerator::allocArrays_BoundaryValues()
             cudaMemoryManager->cudaCopyPress(level);
         }
     }
+
+    // Slip BC: publish the per-level node count to the host and device parameter sets, then allocate and fill the arrays.
+    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+        const auto numberOfSlipValues = int(builder->getSlipSize(level));
+        std::cout << "size slip level " << level << " : " << numberOfSlipValues << std::endl;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        para->getParH(level)->QSlip.kQ = numberOfSlipValues;
+        para->getParD(level)->QSlip.kQ = numberOfSlipValues;
+        para->getParH(level)->kSlipQ   = numberOfSlipValues;
+        para->getParD(level)->kSlipQ   = numberOfSlipValues;
+        para->getParH(level)->kSlipQread = numberOfSlipValues * para->getD3Qxx();
+        para->getParD(level)->kSlipQread = numberOfSlipValues * para->getD3Qxx();
+        if (numberOfSlipValues > 0) // a single node needs its arrays too: later stages act on any count > 0
+        {
+            cudaMemoryManager->cudaAllocSlipBC(level);
+            builder->getSlipValues(para->getParH(level)->QSlip.normalX, para->getParH(level)->QSlip.normalY, para->getParH(level)->QSlip.normalZ, para->getParH(level)->QSlip.k, level);
+            cudaMemoryManager->cudaCopySlipBC(level);
+        }
+    }
+
+    for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
+        const auto numberOfStressValues = int(builder->getStressSize(level));
+        // Stress BC: publish the per-level node count to the host and device parameter sets, then allocate and fill the arrays.
+        std::cout << "size stress level " << level << " : " << numberOfStressValues << std::endl;
+        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        para->getParH(level)->QStress.kQ = numberOfStressValues;
+        para->getParD(level)->QStress.kQ = numberOfStressValues;
+        para->getParH(level)->kStressQ   = numberOfStressValues;
+        para->getParD(level)->kStressQ   = numberOfStressValues;
+        para->getParH(level)->kStressQread = numberOfStressValues * para->getD3Qxx();
+        para->getParD(level)->kStressQread = numberOfStressValues * para->getD3Qxx();
+
+        if (numberOfStressValues > 0) // a single node needs its arrays too: the BC kernel is launched for kStressQ > 0
+        {
+            cudaMemoryManager->cudaAllocStressBC(level);
+            cudaMemoryManager->cudaAllocWallModel(level, para->getHasWallModelMonitor());
+            builder->getStressValues(   para->getParH(level)->QStress.normalX,  para->getParH(level)->QStress.normalY,  para->getParH(level)->QStress.normalZ,
+                                        para->getParH(level)->QStress.Vx,       para->getParH(level)->QStress.Vy,       para->getParH(level)->QStress.Vz,
+                                        para->getParH(level)->QStress.Vx1,      para->getParH(level)->QStress.Vy1,      para->getParH(level)->QStress.Vz1,
+                                        para->getParH(level)->QStress.k,        para->getParH(level)->QStress.kN,
+                                        para->getParH(level)->wallModel.samplingOffset, para->getParH(level)->wallModel.z0,
+                                        level);
+
+            cudaMemoryManager->cudaCopyStressBC(level);
+            cudaMemoryManager->cudaCopyWallModel(level, para->getHasWallModelMonitor());
+        }
+    }
     
 
     for (uint level = 0; level < builder->getNumberOfGridLevels(); level++) {
@@ -137,18 +184,6 @@ void GridGenerator::allocArrays_BoundaryValues()
 
             builder->getVelocityValues(para->getParH(level)->Qinflow.Vx, para->getParH(level)->Qinflow.Vy, para->getParH(level)->Qinflow.Vz, para->getParH(level)->Qinflow.k, level);
 
-
-            //for (int i = 0; i < numberOfVelocityValues; i++)
-            //{
-            //    std::cout << "index: " << para->getParH(level)->Qinflow.k[i];
-            //    std::cout << " (x,y,z)" << para->getParH(level)->coordX_SP[para->getParH(level)->Qinflow.k[i]];
-            //    std::cout << ", " << para->getParH(level)->coordY_SP[para->getParH(level)->Qinflow.k[i]];
-            //    std::cout << ", " << para->getParH(level)->coordZ_SP[para->getParH(level)->Qinflow.k[i]];
-            //    std::cout << " geo: " << para->getParH(level)->geoSP[para->getParH(level)->Qinflow.k[i]];
-            //    std::cout << std::endl;
-            //}
-
-
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
             cudaMemoryManager->cudaCopyVeloBC(level);
@@ -697,7 +732,97 @@ void GridGenerator::allocArrays_BoundaryQs()
         }//ende if
     }//ende oberste for schleife
 
+    for (uint i = 0; i < builder->getNumberOfGridLevels(); i++) { // per level: fetch the slip-BC q values from the builder, then call cudaCopySlipBC
+        int numberOfSlipValues = (int)builder->getSlipSize(i);
+        if (numberOfSlipValues > 0)
+        {
+            std::cout << "size Slip:  " << i << " : " << numberOfSlipValues << std::endl;
+            //cout << "Groesse Pressure:  " << i << " : " << temp1 << "MyID: " << para->getMyID() << endl;
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            // preprocessing: Q.q27[dir] points at the dir-th slice (sizeQ entries each) of the contiguous host buffer QSlip.q27[0]
+            real* QQ = para->getParH(i)->QSlip.q27[0];
+            unsigned int sizeQ = para->getParH(i)->QSlip.kQ;
+            QforBoundaryConditions Q;
+            Q.q27[dirE] = &QQ[dirE   *sizeQ];
+            Q.q27[dirW] = &QQ[dirW   *sizeQ];
+            Q.q27[dirN] = &QQ[dirN   *sizeQ];
+            Q.q27[dirS] = &QQ[dirS   *sizeQ];
+            Q.q27[dirT] = &QQ[dirT   *sizeQ];
+            Q.q27[dirB] = &QQ[dirB   *sizeQ];
+            Q.q27[dirNE] = &QQ[dirNE  *sizeQ];
+            Q.q27[dirSW] = &QQ[dirSW  *sizeQ];
+            Q.q27[dirSE] = &QQ[dirSE  *sizeQ];
+            Q.q27[dirNW] = &QQ[dirNW  *sizeQ];
+            Q.q27[dirTE] = &QQ[dirTE  *sizeQ];
+            Q.q27[dirBW] = &QQ[dirBW  *sizeQ];
+            Q.q27[dirBE] = &QQ[dirBE  *sizeQ];
+            Q.q27[dirTW] = &QQ[dirTW  *sizeQ];
+            Q.q27[dirTN] = &QQ[dirTN  *sizeQ];
+            Q.q27[dirBS] = &QQ[dirBS  *sizeQ];
+            Q.q27[dirBN] = &QQ[dirBN  *sizeQ];
+            Q.q27[dirTS] = &QQ[dirTS  *sizeQ];
+            Q.q27[dirZERO] = &QQ[dirZERO*sizeQ];
+            Q.q27[dirTNE] = &QQ[dirTNE *sizeQ];
+            Q.q27[dirTSW] = &QQ[dirTSW *sizeQ];
+            Q.q27[dirTSE] = &QQ[dirTSE *sizeQ];
+            Q.q27[dirTNW] = &QQ[dirTNW *sizeQ];
+            Q.q27[dirBNE] = &QQ[dirBNE *sizeQ];
+            Q.q27[dirBSW] = &QQ[dirBSW *sizeQ];
+            Q.q27[dirBSE] = &QQ[dirBSE *sizeQ];
+            Q.q27[dirBNW] = &QQ[dirBNW *sizeQ];
+            
+            builder->getSlipQs(Q.q27, i);
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            cudaMemoryManager->cudaCopySlipBC(i);
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        }// end if
+    }// end of outermost for loop
 
+    for (uint i = 0; i < builder->getNumberOfGridLevels(); i++) { // per level: fetch the stress-BC q values from the builder, then call cudaCopyStressBC
+        int numberOfStressValues = (int)builder->getStressSize(i);
+        if (numberOfStressValues > 0)
+        {
+            std::cout << "size Stress:  " << i << " : " << numberOfStressValues << std::endl;
+            //cout << "Groesse Pressure:  " << i << " : " << temp1 << "MyID: " << para->getMyID() << endl;
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            // preprocessing: Q.q27[dir] points at the dir-th slice (sizeQ entries each) of the contiguous host buffer QStress.q27[0]
+            real* QQ = para->getParH(i)->QStress.q27[0];
+            unsigned int sizeQ = para->getParH(i)->QStress.kQ;
+            QforBoundaryConditions Q;
+            Q.q27[dirE] = &QQ[dirE   *sizeQ];
+            Q.q27[dirW] = &QQ[dirW   *sizeQ];
+            Q.q27[dirN] = &QQ[dirN   *sizeQ];
+            Q.q27[dirS] = &QQ[dirS   *sizeQ];
+            Q.q27[dirT] = &QQ[dirT   *sizeQ];
+            Q.q27[dirB] = &QQ[dirB   *sizeQ];
+            Q.q27[dirNE] = &QQ[dirNE  *sizeQ];
+            Q.q27[dirSW] = &QQ[dirSW  *sizeQ];
+            Q.q27[dirSE] = &QQ[dirSE  *sizeQ];
+            Q.q27[dirNW] = &QQ[dirNW  *sizeQ];
+            Q.q27[dirTE] = &QQ[dirTE  *sizeQ];
+            Q.q27[dirBW] = &QQ[dirBW  *sizeQ];
+            Q.q27[dirBE] = &QQ[dirBE  *sizeQ];
+            Q.q27[dirTW] = &QQ[dirTW  *sizeQ];
+            Q.q27[dirTN] = &QQ[dirTN  *sizeQ];
+            Q.q27[dirBS] = &QQ[dirBS  *sizeQ];
+            Q.q27[dirBN] = &QQ[dirBN  *sizeQ];
+            Q.q27[dirTS] = &QQ[dirTS  *sizeQ];
+            Q.q27[dirZERO] = &QQ[dirZERO*sizeQ];
+            Q.q27[dirTNE] = &QQ[dirTNE *sizeQ];
+            Q.q27[dirTSW] = &QQ[dirTSW *sizeQ];
+            Q.q27[dirTSE] = &QQ[dirTSE *sizeQ];
+            Q.q27[dirTNW] = &QQ[dirTNW *sizeQ];
+            Q.q27[dirBNE] = &QQ[dirBNE *sizeQ];
+            Q.q27[dirBSW] = &QQ[dirBSW *sizeQ];
+            Q.q27[dirBSE] = &QQ[dirBSE *sizeQ];
+            Q.q27[dirBNW] = &QQ[dirBNW *sizeQ];
+            
+            builder->getStressQs(Q.q27, i);
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+            cudaMemoryManager->cudaCopyStressBC(i);
+            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+        }// end if
+    }// end of outermost for loop
 
     for (uint i = 0; i < builder->getNumberOfGridLevels(); i++) {
         const auto numberOfVelocityNodes = int(builder->getVelocitySize(i));
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
index 6685a34b98c03067426db7452b63c060e1723058..4e8eb124731cffb54a51018fa6f06da45f671c73 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.cpp
@@ -1355,15 +1355,19 @@ void CudaMemoryManager::cudaAllocSlipBC(int lev)
     //Host
     checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QSlip.q27[0]), parameter->getD3Qxx()*mem_size_Q_q      ));
     checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QSlip.k),                            mem_size_Q_k      ));
-    //checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QSlip.qread),             mem_size_Q_q_read ));//Geller
-    //checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QSlip.valueQ),            mem_size_Q_value  ));//Geller
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QSlip.normalX),                      mem_size_Q_q    ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QSlip.normalY),                      mem_size_Q_q    ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QSlip.normalZ),                      mem_size_Q_q    ));
     
     //Device
     checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QSlip.q27[0]),     parameter->getD3Qxx()* mem_size_Q_q     ));
     checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QSlip.k),                                 mem_size_Q_k     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QSlip.normalX),                           mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QSlip.normalY),                           mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QSlip.normalZ),                           mem_size_Q_q     ));
     
     //////////////////////////////////////////////////////////////////////////
-    double tmp = (double)mem_size_Q_k + (double)parameter->getD3Qxx()*(double)mem_size_Q_q;
+    double tmp = (double)mem_size_Q_k + (double)parameter->getD3Qxx()*(double)mem_size_Q_q + 3.0*(double)mem_size_Q_q;;
     setMemsizeGPU(tmp, false);
 }
 void CudaMemoryManager::cudaCopySlipBC(int lev)
@@ -1371,15 +1375,150 @@ void CudaMemoryManager::cudaCopySlipBC(int lev)
     unsigned int mem_size_Q_k = sizeof(int)*parameter->getParH(lev)->QSlip.kQ;
     unsigned int mem_size_Q_q = sizeof(real)*parameter->getParH(lev)->QSlip.kQ;
     
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QSlip.q27[0], parameter->getParH(lev)->QSlip.q27[0], parameter->getD3Qxx()* mem_size_Q_q,       cudaMemcpyHostToDevice));
-    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QSlip.k,      parameter->getParH(lev)->QSlip.k,                  mem_size_Q_k,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QSlip.q27[0], parameter->getParH(lev)->QSlip.q27[0],  parameter->getD3Qxx()* mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QSlip.k,      parameter->getParH(lev)->QSlip.k,                              mem_size_Q_k,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QSlip.normalX, parameter->getParH(lev)->QSlip.normalX,                       mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QSlip.normalY, parameter->getParH(lev)->QSlip.normalY,                       mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QSlip.normalZ, parameter->getParH(lev)->QSlip.normalZ,                       mem_size_Q_q,       cudaMemcpyHostToDevice));
 }
 void CudaMemoryManager::cudaFreeSlipBC(int lev)
 {
     checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QSlip.q27[0]));
     checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QSlip.k));
-    //checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QSlip.valueQ));
-    //checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QSlip.qread));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QSlip.normalX));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QSlip.normalY));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QSlip.normalZ));
+}
+//Stress: allocates the host and device arrays of the stress boundary condition (QStress) for grid level lev.
+void CudaMemoryManager::cudaAllocStressBC(int lev)
+{
+    unsigned int mem_size_Q_k      = sizeof(int)*parameter->getParH(lev)->QStress.kQ;
+    unsigned int mem_size_Q_q      = sizeof(real)*parameter->getParH(lev)->QStress.kQ;
+    // mem_size_Q_k / mem_size_Q_q: bytes of one int / one real array with QStress.kQ entries; q27[0] gets getD3Qxx() real arrays in one allocation.
+    //Host
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.q27[0]), parameter->getD3Qxx()*mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.k),                            mem_size_Q_k      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.kN),                           mem_size_Q_k     ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.normalX),                      mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.normalY),                      mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.normalZ),                      mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.Vx),                           mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.Vy),                           mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.Vz),                           mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.Vx1),                          mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.Vy1),                          mem_size_Q_q      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->QStress.Vz1),                          mem_size_Q_q      ));
+    
+    //Device
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.q27[0]),     parameter->getD3Qxx()* mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.k),                                 mem_size_Q_k     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.kN),                                mem_size_Q_k    ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.normalX),                           mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.normalY),                           mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.normalZ),                           mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.Vx),                                mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.Vy),                                mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.Vz),                                mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.Vx1),                               mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.Vy1),                               mem_size_Q_q     ));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->QStress.Vz1),                               mem_size_Q_q     ));
+    // Size handed to setMemsizeGPU: k and kN (2 int arrays), q27, and the 9 real arrays (normals, V*, V*1) allocated on the device above.
+    //////////////////////////////////////////////////////////////////////////
+    double tmp = 2*(double)mem_size_Q_k + (double)parameter->getD3Qxx()*(double)mem_size_Q_q + 9.0*(double)mem_size_Q_q;
+    setMemsizeGPU(tmp, false);
+}
+// Copies every QStress array of grid level lev from host to device (cudaMemcpyHostToDevice).
+void CudaMemoryManager::cudaCopyStressBC(int lev)
+{
+    unsigned int mem_size_Q_k = sizeof(int)*parameter->getParH(lev)->QStress.kQ;
+    unsigned int mem_size_Q_q = sizeof(real)*parameter->getParH(lev)->QStress.kQ;
+    // q27[0] is transferred as one block of getD3Qxx() real arrays; k and kN use the int size, all remaining arrays the real size.
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.q27[0],  parameter->getParH(lev)->QStress.q27[0], parameter->getD3Qxx()* mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.k,       parameter->getParH(lev)->QStress.k,                             mem_size_Q_k,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.kN,      parameter->getParH(lev)->QStress.kN,                            mem_size_Q_k,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.normalX, parameter->getParH(lev)->QStress.normalX,                       mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.normalY, parameter->getParH(lev)->QStress.normalY,                       mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.normalZ, parameter->getParH(lev)->QStress.normalZ,                       mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.Vx,      parameter->getParH(lev)->QStress.Vx,                            mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.Vy,      parameter->getParH(lev)->QStress.Vy,                            mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.Vz,      parameter->getParH(lev)->QStress.Vz,                            mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.Vx1,     parameter->getParH(lev)->QStress.Vx1,                           mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.Vy1,     parameter->getParH(lev)->QStress.Vy1,                           mem_size_Q_q,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->QStress.Vz1,     parameter->getParH(lev)->QStress.Vz1,                           mem_size_Q_q,       cudaMemcpyHostToDevice));
+}
+void CudaMemoryManager::cudaFreeStressBC(int lev) // frees the host-side QStress arrays of grid level lev via cudaFreeHost
+{ // NOTE(review): the device arrays allocated in cudaAllocStressBC are not released here - confirm they are freed elsewhere
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.q27[0]));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.k));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.kN));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.normalX));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.normalY));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.normalZ));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.Vx));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.Vy));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.Vz));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.Vx1));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.Vy1));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->QStress.Vz1));
+}
+// Wall model: allocates the host and device wallModel arrays of grid level lev; the monitor arrays (u_star, Fx, Fy, Fz) only if hasWallModelMonitor is set.
+void CudaMemoryManager::cudaAllocWallModel(int lev, bool hasWallModelMonitor)
+{
+    unsigned int mem_size_Q_k      = sizeof(int)*parameter->getParH(lev)->QStress.kQ;
+    unsigned int mem_size_Q_q      = sizeof(real)*parameter->getParH(lev)->QStress.kQ;
+    // All wall-model arrays are sized with QStress.kQ entries: samplingOffset uses the int size, the others the real size.
+    //Host
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->wallModel.samplingOffset),  mem_size_Q_k      ));
+    checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->wallModel.z0),              mem_size_Q_q      ));
+    if(hasWallModelMonitor) 
+    {
+        checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->wallModel.u_star),      mem_size_Q_q      ));
+        checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->wallModel.Fx),          mem_size_Q_q      ));
+        checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->wallModel.Fy),          mem_size_Q_q      ));
+        checkCudaErrors( cudaMallocHost((void**) &(parameter->getParH(lev)->wallModel.Fz),          mem_size_Q_q      ));
+    }
+    
+    //Device
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->wallModel.samplingOffset),  mem_size_Q_k));
+    checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->wallModel.z0),  mem_size_Q_q));
+    if(hasWallModelMonitor) 
+    {
+        checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->wallModel.u_star),      mem_size_Q_q      ));
+        checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->wallModel.Fx),          mem_size_Q_q      ));
+        checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->wallModel.Fy),          mem_size_Q_q      ));
+        checkCudaErrors( cudaMalloc((void**) &(parameter->getParD(lev)->wallModel.Fz),          mem_size_Q_q      ));
+    }
+    //////////////////////////////////////////////////////////////////////////
+    double tmp = (double)mem_size_Q_k + (double)mem_size_Q_q;
+    if(hasWallModelMonitor) tmp += 4.0*(double)mem_size_Q_q; // also count u_star, Fx, Fy, Fz, which are allocated on the device above
+    setMemsizeGPU(tmp, false);
+}
+// Copies the wallModel arrays of grid level lev from host to device; the monitor arrays (u_star, Fx, Fy, Fz) only if hasWallModelMonitor is set.
+void CudaMemoryManager::cudaCopyWallModel(int lev, bool hasWallModelMonitor)
+{
+    unsigned int mem_size_Q_k      = sizeof(int)*parameter->getParH(lev)->QStress.kQ;
+    unsigned int mem_size_Q_q      = sizeof(real)*parameter->getParH(lev)->QStress.kQ;
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->wallModel.samplingOffset,  parameter->getParH(lev)->wallModel.samplingOffset,  mem_size_Q_k,       cudaMemcpyHostToDevice));
+    checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->wallModel.z0,              parameter->getParH(lev)->wallModel.z0,              mem_size_Q_q,       cudaMemcpyHostToDevice));
+    if(hasWallModelMonitor)
+    {
+        checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->wallModel.u_star,          parameter->getParH(lev)->wallModel.u_star,          mem_size_Q_q,       cudaMemcpyHostToDevice)); // u_star is allocated with the real size (mem_size_Q_q), so it must be copied with it
+        checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->wallModel.Fx,              parameter->getParH(lev)->wallModel.Fx,              mem_size_Q_q,       cudaMemcpyHostToDevice));
+        checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->wallModel.Fy,              parameter->getParH(lev)->wallModel.Fy,              mem_size_Q_q,       cudaMemcpyHostToDevice));
+        checkCudaErrors( cudaMemcpy(parameter->getParD(lev)->wallModel.Fz,              parameter->getParH(lev)->wallModel.Fz,              mem_size_Q_q,       cudaMemcpyHostToDevice));
+    }
+}
+void CudaMemoryManager::cudaFreeWallModel(int lev, bool hasWallModelMonitor) // frees the host-side wallModel arrays of grid level lev via cudaFreeHost
+{ // NOTE(review): the device arrays allocated in cudaAllocWallModel are not released here - confirm they are freed elsewhere
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->wallModel.samplingOffset));
+    checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->wallModel.z0));
+    if(hasWallModelMonitor) // the monitor arrays are only allocated when this flag was set in cudaAllocWallModel
+    {
+        checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->wallModel.u_star));
+        checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->wallModel.Fx));
+        checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->wallModel.Fy));
+        checkCudaErrors( cudaFreeHost(parameter->getParH(lev)->wallModel.Fz));
+    }
 }
 
 //Test roundoff error
@@ -2875,18 +3014,18 @@ void CudaMemoryManager::cudaFreeProbeDistances(Probe* probe, int level)
 
 void CudaMemoryManager::cudaAllocProbeIndices(Probe* probe, int level)
 {
-    size_t tmp = sizeof(int)*probe->getProbeStruct(level)->nPoints;
+    size_t tmp = sizeof(int)*probe->getProbeStruct(level)->nIndices;
     checkCudaErrors( cudaMallocHost((void**) &probe->getProbeStruct(level)->pointIndicesH, tmp) );
     checkCudaErrors( cudaMalloc    ((void**) &probe->getProbeStruct(level)->pointIndicesD, tmp) );
     setMemsizeGPU(1.f*tmp, false);
 }
 void CudaMemoryManager::cudaCopyProbeIndicesHtoD(Probe* probe, int level)
 {
-    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->pointIndicesD, probe->getProbeStruct(level)->pointIndicesH, sizeof(int)*probe->getProbeStruct(level)->nPoints, cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->pointIndicesD, probe->getProbeStruct(level)->pointIndicesH, sizeof(int)*probe->getProbeStruct(level)->nIndices, cudaMemcpyHostToDevice) );
 }
 void CudaMemoryManager::cudaCopyProbeIndicesDtoH(Probe* probe, int level)
 {
-    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->pointIndicesH, probe->getProbeStruct(level)->pointIndicesD, sizeof(int)*probe->getProbeStruct(level)->nPoints, cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->pointIndicesH, probe->getProbeStruct(level)->pointIndicesD, sizeof(int)*probe->getProbeStruct(level)->nIndices, cudaMemcpyDeviceToHost) );
 }
 void CudaMemoryManager::cudaFreeProbeIndices(Probe* probe, int level)
 {
@@ -2899,8 +3038,11 @@ void CudaMemoryManager::cudaAllocProbeQuantityArray(Probe* probe, int level)
     size_t tmp = sizeof(real)*probe->getProbeStruct(level)->nArrays*probe->getProbeStruct(level)->nPoints;
 
     checkCudaErrors( cudaMallocHost((void**) &probe->getProbeStruct(level)->quantitiesArrayH, tmp) );
-    checkCudaErrors( cudaMalloc    ((void**) &probe->getProbeStruct(level)->quantitiesArrayD, tmp) );
-    setMemsizeGPU(1.f*tmp, false);
+    if(probe->getHasDeviceQuantityArray())
+    {
+        checkCudaErrors( cudaMalloc    ((void**) &probe->getProbeStruct(level)->quantitiesArrayD, tmp) );
+        setMemsizeGPU(1.f*tmp, false);
+    }
 }
 
 void CudaMemoryManager::cudaCopyProbeQuantityArrayHtoD(Probe* probe, int level)
@@ -2914,13 +3056,14 @@ void CudaMemoryManager::cudaCopyProbeQuantityArrayDtoH(Probe* probe, int level)
 void CudaMemoryManager::cudaFreeProbeQuantityArray(Probe* probe, int level)
 {
     checkCudaErrors( cudaFreeHost(probe->getProbeStruct(level)->quantitiesArrayH) );
-    checkCudaErrors( cudaFree    (probe->getProbeStruct(level)->quantitiesArrayD) );
+    if(probe->getHasDeviceQuantityArray())
+        checkCudaErrors( cudaFree    (probe->getProbeStruct(level)->quantitiesArrayD) );
 }
 
 void CudaMemoryManager::cudaAllocProbeQuantitiesAndOffsets(Probe* probe, int level)
 {
-    size_t tmpA = int(PostProcessingVariable::LAST)*sizeof(int);
-    size_t tmpQ = int(PostProcessingVariable::LAST)*sizeof(bool);
+    size_t tmpA = int(Statistic::LAST)*sizeof(int);
+    size_t tmpQ = int(Statistic::LAST)*sizeof(bool);
     checkCudaErrors( cudaMallocHost((void**) &probe->getProbeStruct(level)->quantitiesH, tmpQ) );    
     checkCudaErrors( cudaMalloc    ((void**) &probe->getProbeStruct(level)->quantitiesD, tmpQ) );
     checkCudaErrors( cudaMallocHost((void**) &probe->getProbeStruct(level)->arrayOffsetsH, tmpA) );    
@@ -2930,14 +3073,14 @@ void CudaMemoryManager::cudaAllocProbeQuantitiesAndOffsets(Probe* probe, int lev
 
 void CudaMemoryManager::cudaCopyProbeQuantitiesAndOffsetsHtoD(Probe* probe, int level)
 {    
-    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->quantitiesD, probe->getProbeStruct(level)->quantitiesH, int(PostProcessingVariable::LAST)*sizeof(bool), cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->arrayOffsetsD, probe->getProbeStruct(level)->arrayOffsetsH, int(PostProcessingVariable::LAST)*sizeof(int), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->quantitiesD, probe->getProbeStruct(level)->quantitiesH, int(Statistic::LAST)*sizeof(bool), cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->arrayOffsetsD, probe->getProbeStruct(level)->arrayOffsetsH, int(Statistic::LAST)*sizeof(int), cudaMemcpyHostToDevice) );
 }
 
 void CudaMemoryManager::cudaCopyProbeQuantitiesAndOffsetsDtoH(Probe* probe, int level)
 {
-    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->quantitiesH, probe->getProbeStruct(level)->quantitiesD, int(PostProcessingVariable::LAST)*sizeof(bool), cudaMemcpyDeviceToHost) );
-    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->arrayOffsetsH, probe->getProbeStruct(level)->arrayOffsetsD, int(PostProcessingVariable::LAST)*sizeof(int), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->quantitiesH, probe->getProbeStruct(level)->quantitiesD, int(Statistic::LAST)*sizeof(bool), cudaMemcpyDeviceToHost) );
+    checkCudaErrors( cudaMemcpy(probe->getProbeStruct(level)->arrayOffsetsH, probe->getProbeStruct(level)->arrayOffsetsD, int(Statistic::LAST)*sizeof(int), cudaMemcpyDeviceToHost) );
 }
 void CudaMemoryManager::cudaFreeProbeQuantitiesAndOffsets(Probe* probe, int level)
 {
diff --git a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
index e81497cb4e5824afabc3303a984817fe3a2ff68b..27b16240cb63b4505017a7dc50e3a5fc9b19ce82 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/CudaMemoryManager.h
@@ -164,6 +164,14 @@ public:
     void cudaAllocSlipBC(int lev);
     void cudaCopySlipBC(int lev);
     void cudaFreeSlipBC(int lev);
+
+    void cudaAllocStressBC(int lev);
+    void cudaCopyStressBC(int lev);
+    void cudaFreeStressBC(int lev);
+
+    void cudaAllocWallModel(int lev, bool hasWallModelMonitor);
+    void cudaCopyWallModel(int lev,  bool hasWallModelMonitor);
+    void cudaFreeWallModel(int lev,  bool hasWallModelMonitor);
     
     void cudaAllocGeomValuesBC(int lev);
     void cudaCopyGeomValuesBC(int lev);
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
index dfdbac44d197e55e3e78eb794692fd9443cb7ab6..f7b89610d09cec436ebc6cb0e4473dbf6245c847 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
@@ -963,6 +963,8 @@ extern "C" void QSlipDevComp27(unsigned int numberOfThreads,
 							   unsigned int* neighborX,
 							   unsigned int* neighborY,
 							   unsigned int* neighborZ,
+							   real* turbViscosity,
+                        	   bool useTurbViscosity,
 							   unsigned int size_Mat, 
 							   bool evenOrOdd);
 
@@ -996,6 +998,70 @@ extern "C" void QSlipNormDevComp27(unsigned int numberOfThreads,
 								   unsigned int size_Mat, 
 								   bool evenOrOdd);
 
+extern "C" void QStressDevComp27(unsigned int numberOfThreads,
+								real* DD, 
+								int* k_Q, 
+								int* k_N,
+								real* QQ,
+								unsigned int sizeQ,
+								real om1, 
+								real* turbViscosity,
+								real* vx,
+								real* vy,
+								real* vz,
+								real* normalX,
+								real* normalY,
+								real* normalZ,
+								real* vx_el,
+								real* vy_el,
+								real* vz_el,
+								real* vx_w_mean,
+								real* vy_w_mean,
+								real* vz_w_mean,
+								int* samplingOffset,
+								real* z0,
+								bool  hasWallModelMonitor,
+								real* u_star,
+								real* Fx,
+								real* Fy,
+								real* Fz,
+								unsigned int* neighborX,
+								unsigned int* neighborY,
+								unsigned int* neighborZ,
+								unsigned int size_Mat, 
+								bool evenOrOdd);
+
+extern "C" void BBStressDev27(  unsigned int numberOfThreads,
+								real* DD, 
+								int* k_Q, 
+								int* k_N, 
+								real* QQ,
+								unsigned int sizeQ,
+								real* vx,
+								real* vy,
+								real* vz,
+								real* normalX,
+								real* normalY,
+								real* normalZ,
+								real* vx_el,
+								real* vy_el,
+								real* vz_el,
+								real* vx_w_mean,
+								real* vy_w_mean,
+								real* vz_w_mean,
+								int* samplingOffset,
+								real* z0,
+								bool  hasWallModelMonitor,
+								real* u_star,
+								real* Fx,
+								real* Fy,
+								real* Fz,
+								unsigned int* neighborX,
+								unsigned int* neighborY,
+								unsigned int* neighborZ,
+								unsigned int size_Mat, 
+								bool evenOrOdd);
+
 extern "C" void QPressDev27(unsigned int numberOfThreads,
                           int nx,
                           int ny,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
index 288db43e7bcd36dc4d187982b86178d345601094..d7d38baf2bcf6f5d3abe342359b7676f4ad8266b 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
@@ -916,6 +916,18 @@ extern "C" __global__ void QSlipDeviceComp27(real* DD,
 											 unsigned int size_Mat, 
 											 bool evenOrOdd);
 
+extern "C" __global__ void QSlipDeviceComp27TurbViscosity(real* DD, 
+											 int* k_Q, 
+											 real* QQ,
+											 unsigned int sizeQ,
+											 real om1, 
+											 unsigned int* neighborX,
+											 unsigned int* neighborY,
+											 unsigned int* neighborZ,
+											 real* turbViscosity,
+											 unsigned int size_Mat, 
+											 bool evenOrOdd);
+
 extern "C" __global__ void QSlipGeomDeviceComp27(real* DD, 
 												 int* k_Q, 
 												 real* QQ,
@@ -944,6 +956,69 @@ extern "C" __global__ void QSlipNormDeviceComp27(real* DD,
 												 unsigned int size_Mat, 
 												 bool evenOrOdd);
 
+// Stress BCs (wall model)
+extern "C" __global__ void QStressDeviceComp27(real* DD, 
+											   int* k_Q,
+											 int* k_N, 
+											 real* QQ,
+											 unsigned int sizeQ,
+											 real om1, 
+											 real* turbViscosity,
+										     real* vx,
+											 real* vy,
+                                    	     real* vz,
+											 real* normalX,
+											 real* normalY,
+                                    	     real* normalZ,
+											 real* vx_bc,
+											 real* vy_bc,
+                                    	     real* vz_bc,
+											 real* vx1,
+                                    		 real* vy1,
+                                    		 real* vz1,
+											 int* samplingOffset,
+											 real* z0,
+											 bool  hasWallModelMonitor,
+											real* u_star_monitor,
+											real* Fx_monitor,
+											real* Fy_monitor,
+											real* Fz_monitor,
+											 unsigned int* neighborX,
+											 unsigned int* neighborY,
+											 unsigned int* neighborZ,
+											 unsigned int size_Mat, 
+											 bool evenOrOdd);
+
+extern "C" __global__ void BBStressDevice27( real* DD, 
+												int* k_Q, 
+												int* k_N, 
+												real* QQ,
+												unsigned int sizeQ,
+												real* vx,
+												real* vy,
+												real* vz,
+												real* normalX,
+												real* normalY,
+												real* normalZ,
+												real* vx_bc,
+												real* vy_bc,
+												real* vz_bc,
+												real* vx1,
+												real* vy1,
+												real* vz1,
+												int* samplingOffset,
+												real* z0,
+												bool  hasWallModelMonitor,
+												real* u_star_monitor,
+												real* Fx_monitor,
+												real* Fy_monitor,
+												real* Fz_monitor,
+												unsigned int* neighborX,
+												unsigned int* neighborY,
+												unsigned int* neighborZ,
+												unsigned int size_Mat, 
+												bool evenOrOdd);
+
 //Pressure BCs
 extern "C" __global__ void QPressDevice27(int inx,
                                            int iny,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
index 8b80e5ea9dbed3deef4c4332b2d43bf62ba9e48b..4dce487fc98ee077798f7f75bfbf96906e7585b0 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
@@ -3604,6 +3604,8 @@ extern "C" void QSlipDevComp27(unsigned int numberOfThreads,
 							   unsigned int* neighborX,
 							   unsigned int* neighborY,
 							   unsigned int* neighborZ,
+                        real* turbViscosity,
+                        bool useTurbViscosity,
 							   unsigned int size_Mat, 
 							   bool evenOrOdd)
 {
@@ -3621,7 +3623,24 @@ extern "C" void QSlipDevComp27(unsigned int numberOfThreads,
    }
    dim3 gridQ(Grid1, Grid2);
    dim3 threads(numberOfThreads, 1, 1 );
-
+   
+   if(useTurbViscosity)
+   {
+      QSlipDeviceComp27TurbViscosity<<< gridQ, threads >>> (DD, 
+											   k_Q, 
+											   QQ,
+											   sizeQ,
+											   om1, 
+											   neighborX,
+											   neighborY,
+											   neighborZ,
+                                    turbViscosity,
+											   size_Mat, 
+											   evenOrOdd);
+      getLastCudaError("QSlipDeviceComp27TurbViscosity execution failed");
+   }
+   else
+   {
       QSlipDeviceComp27<<< gridQ, threads >>> (DD, 
 											   k_Q, 
 											   QQ,
@@ -3632,7 +3651,8 @@ extern "C" void QSlipDevComp27(unsigned int numberOfThreads,
 											   neighborZ,
 											   size_Mat, 
 											   evenOrOdd);
-      getLastCudaError("QSlipDeviceComp27 execution failed"); 
+      getLastCudaError("QSlipDeviceComp27 execution failed");
+   }       
 }
 //////////////////////////////////////////////////////////////////////////
 extern "C" void QSlipGeomDevComp27(unsigned int numberOfThreads,
@@ -3727,6 +3747,167 @@ extern "C" void QSlipNormDevComp27(unsigned int numberOfThreads,
       getLastCudaError("QSlipGeomDeviceComp27 execution failed"); 
 }
 //////////////////////////////////////////////////////////////////////////
+// Host-side launcher for the QStressDeviceComp27 kernel.
+// Derives a 2D launch grid from sizeQ and numberOfThreads, passes every argument except
+// numberOfThreads on to the kernel unchanged, and then calls getLastCudaError.
+//   numberOfThreads - threads per block (x-extent of the thread block)
+//   sizeQ           - entry count used to size the grid (presumably the number of
+//                     boundary nodes behind k_Q/QQ - TODO confirm against the kernel)
+//   The meaning of the remaining arguments is defined by the kernel, not by this wrapper.
+extern "C" void QStressDevComp27(unsigned int numberOfThreads,
+                                 real* DD,
+                                 int* k_Q,
+                                 int* k_N,
+                                 real* QQ,
+                                 unsigned int sizeQ,
+                                 real om1,
+                                 real* turbViscosity,
+                                 real* vx, real* vy, real* vz,
+                                 real* normalX, real* normalY, real* normalZ,
+                                 real* vx_bc, real* vy_bc, real* vz_bc,
+                                 real* vx1, real* vy1, real* vz1,
+                                 int* samplingOffset,
+                                 real* z0,
+                                 bool  hasWallModelMonitor,
+                                 real* u_star,
+                                 real* Fx, real* Fy, real* Fz,
+                                 unsigned int* neighborX,
+                                 unsigned int* neighborY,
+                                 unsigned int* neighborZ,
+                                 unsigned int size_Mat,
+                                 bool evenOrOdd)
+{
+   // Block count: integer quotient plus one, so a trailing partial block is included.
+   int Grid = (sizeQ / numberOfThreads)+1;
+   int Grid1, Grid2;
+   if (Grid>512)
+   {
+      // More than 512 blocks: 512 in grid-x and (Grid/512)+1 in grid-y.
+      Grid1 = 512;
+      Grid2 = (Grid/Grid1)+1;
+   }
+   else
+   {
+      Grid1 = 1;
+      Grid2 = Grid;
+   }
+   dim3 gridQ(Grid1, Grid2);
+   dim3 threads(numberOfThreads, 1, 1 );
+
+   QStressDeviceComp27<<< gridQ, threads >>> (DD,
+                                              k_Q,
+                                              k_N,
+                                              QQ,
+                                              sizeQ,
+                                              om1,
+                                              turbViscosity,
+                                              vx,
+                                              vy,
+                                              vz,
+                                              normalX,
+                                              normalY,
+                                              normalZ,
+                                              vx_bc,
+                                              vy_bc,
+                                              vz_bc,
+                                              vx1,
+                                              vy1,
+                                              vz1,
+                                              samplingOffset,
+                                              z0,
+                                              hasWallModelMonitor,
+                                              u_star,
+                                              Fx,
+                                              Fy,
+                                              Fz,
+                                              neighborX,
+                                              neighborY,
+                                              neighborZ,
+                                              size_Mat,
+                                              evenOrOdd);
+   // The message must name the kernel launched above so a failure is attributed correctly.
+   getLastCudaError("QStressDeviceComp27 execution failed");
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Launches the BBStressDevice27 kernel from the host side.
+//
+// The launch grid is derived from sizeQ and numberOfThreads only:
+//   blocks = sizeQ / numberOfThreads + 1   (integer division)
+//   blocks >  512  ->  grid of 512 x (blocks / 512 + 1)
+//   blocks <= 512  ->  grid of 1 x blocks
+// Each block has numberOfThreads threads in x. Every argument except numberOfThreads is
+// handed to the kernel as-is; what the kernel does with them is defined outside this
+// function. After the launch, getLastCudaError is called with a message naming the kernel.
+extern "C" void BBStressDev27(unsigned int numberOfThreads,
+                              real* DD,
+                              int* k_Q,
+                              int* k_N,
+                              real* QQ,
+                              unsigned int sizeQ,
+                              real* vx, real* vy, real* vz,
+                              real* normalX, real* normalY, real* normalZ,
+                              real* vx_bc, real* vy_bc, real* vz_bc,
+                              real* vx1, real* vy1, real* vz1,
+                              int* samplingOffset,
+                              real* z0,
+                              bool hasWallModelMonitor,
+                              real* u_star,
+                              real* Fx, real* Fy, real* Fz,
+                              unsigned int* neighborX,
+                              unsigned int* neighborY,
+                              unsigned int* neighborZ,
+                              unsigned int size_Mat,
+                              bool evenOrOdd)
+{
+   // Number of thread blocks: integer quotient plus one.
+   const int blockCount = (sizeQ / numberOfThreads) + 1;
+
+   // Beyond 512 blocks the grid becomes 512 in x with (blockCount / 512) + 1 in y;
+   // otherwise x stays 1 and y carries all blocks.
+   const bool exceedsLimit = blockCount > 512;
+   const int  gridExtentX  = exceedsLimit ? 512 : 1;
+   const int  gridExtentY  = exceedsLimit ? (blockCount / 512) + 1 : blockCount;
+
+   const dim3 launchGrid(gridExtentX, gridExtentY);
+   const dim3 launchBlock(numberOfThreads, 1, 1);
+
+   // Argument order below mirrors this function's parameter list (minus numberOfThreads).
+   BBStressDevice27<<< launchGrid, launchBlock >>>(
+      DD,
+      k_Q,
+      k_N,
+      QQ,
+      sizeQ,
+      vx,
+      vy,
+      vz,
+      normalX,
+      normalY,
+      normalZ,
+      vx_bc,
+      vy_bc,
+      vz_bc,
+      vx1,
+      vy1,
+      vz1,
+      samplingOffset,
+      z0,
+      hasWallModelMonitor,
+      u_star,
+      Fx,
+      Fy,
+      Fz,
+      neighborX,
+      neighborY,
+      neighborZ,
+      size_Mat,
+      evenOrOdd);
+
+   // Error check issued right after the launch.
+   getLastCudaError("BBStressDevice27 execution failed");
+}
+//////////////////////////////////////////////////////////////////////////
 extern "C" void QPressDev27(unsigned int numberOfThreads,
                              int nx,
                              int ny,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
index fc792a2c3a0f7438f4ee0882988a39f7260f21be..e5c017e6b1941f0a7b53e23c70698ccaf7a18987 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
@@ -803,6 +803,7 @@ extern "C" __global__ void QSlipDeviceComp27(real* DD,
       unsigned int kbne = kb;
       unsigned int ktne = KQK;
       unsigned int kbsw = neighborZ[ksw];
+      
       ////////////////////////////////////////////////////////////////////////////////
       real f_W    = (D.f[dirE   ])[ke   ];
       real f_E    = (D.f[dirW   ])[kw   ];
@@ -1076,6 +1077,7 @@ extern "C" __global__ void QSlipDeviceComp27(real* DD,
 	     VeloZ = fac*vx3;
 		 if (x == true) VeloX = c0o1;
 		 if (z == true) VeloZ = c0o1;
+      //  if (k==10000) printf("AFTER x: %u \t  y: %u \t z: %u \n  VeloX: %f \t VeloY: %f \t VeloZ: %f \n\n", x,y,z, VeloX,VeloY,VeloZ);
          feq=c1o54* (drho/*+three*( vx1    +vx3)*/+c9o2*( vx1    +vx3)*( vx1    +vx3) * (c1o1 + drho)-cu_sq); 
          (D.f[dirBW])[kbw]=(c1o1-q)/(c1o1+q)*(f_TE-f_BW+(f_TE+f_BW-c2o1*feq*om1)/(c1o1-om1))*c1o2+(q*(f_TE+f_BW)-c6o1*c1o54*( VeloX+VeloZ))/(c1o1+q) - c1o54 * drho;
          //feq=c1over54* (drho+three*( vx1    +vx3)+c9over2*( vx1    +vx3)*( vx1    +vx3)-cu_sq); 
@@ -1318,7 +1320,673 @@ extern "C" __global__ void QSlipDeviceComp27(real* DD,
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////
+extern "C" __global__ void QSlipDeviceComp27TurbViscosity(real* DD, 
+											 int* k_Q, 
+											 real* QQ,
+											 unsigned int sizeQ,
+											 real om1, 
+											 unsigned int* neighborX,
+											 unsigned int* neighborY,
+											 unsigned int* neighborZ,
+                                  real* turbViscosity,
+											 unsigned int size_Mat, 
+											 bool evenOrOdd)
+{
+   Distributions27 D;
+   if (evenOrOdd==true)
+   {
+      D.f[dirE   ] = &DD[dirE   *size_Mat];
+      D.f[dirW   ] = &DD[dirW   *size_Mat];
+      D.f[dirN   ] = &DD[dirN   *size_Mat];
+      D.f[dirS   ] = &DD[dirS   *size_Mat];
+      D.f[dirT   ] = &DD[dirT   *size_Mat];
+      D.f[dirB   ] = &DD[dirB   *size_Mat];
+      D.f[dirNE  ] = &DD[dirNE  *size_Mat];
+      D.f[dirSW  ] = &DD[dirSW  *size_Mat];
+      D.f[dirSE  ] = &DD[dirSE  *size_Mat];
+      D.f[dirNW  ] = &DD[dirNW  *size_Mat];
+      D.f[dirTE  ] = &DD[dirTE  *size_Mat];
+      D.f[dirBW  ] = &DD[dirBW  *size_Mat];
+      D.f[dirBE  ] = &DD[dirBE  *size_Mat];
+      D.f[dirTW  ] = &DD[dirTW  *size_Mat];
+      D.f[dirTN  ] = &DD[dirTN  *size_Mat];
+      D.f[dirBS  ] = &DD[dirBS  *size_Mat];
+      D.f[dirBN  ] = &DD[dirBN  *size_Mat];
+      D.f[dirTS  ] = &DD[dirTS  *size_Mat];
+      D.f[dirZERO] = &DD[dirZERO*size_Mat];
+      D.f[dirTNE ] = &DD[dirTNE *size_Mat];
+      D.f[dirTSW ] = &DD[dirTSW *size_Mat];
+      D.f[dirTSE ] = &DD[dirTSE *size_Mat];
+      D.f[dirTNW ] = &DD[dirTNW *size_Mat];
+      D.f[dirBNE ] = &DD[dirBNE *size_Mat];
+      D.f[dirBSW ] = &DD[dirBSW *size_Mat];
+      D.f[dirBSE ] = &DD[dirBSE *size_Mat];
+      D.f[dirBNW ] = &DD[dirBNW *size_Mat];
+   } 
+   else
+   {
+      D.f[dirW   ] = &DD[dirE   *size_Mat];
+      D.f[dirE   ] = &DD[dirW   *size_Mat];
+      D.f[dirS   ] = &DD[dirN   *size_Mat];
+      D.f[dirN   ] = &DD[dirS   *size_Mat];
+      D.f[dirB   ] = &DD[dirT   *size_Mat];
+      D.f[dirT   ] = &DD[dirB   *size_Mat];
+      D.f[dirSW  ] = &DD[dirNE  *size_Mat];
+      D.f[dirNE  ] = &DD[dirSW  *size_Mat];
+      D.f[dirNW  ] = &DD[dirSE  *size_Mat];
+      D.f[dirSE  ] = &DD[dirNW  *size_Mat];
+      D.f[dirBW  ] = &DD[dirTE  *size_Mat];
+      D.f[dirTE  ] = &DD[dirBW  *size_Mat];
+      D.f[dirTW  ] = &DD[dirBE  *size_Mat];
+      D.f[dirBE  ] = &DD[dirTW  *size_Mat];
+      D.f[dirBS  ] = &DD[dirTN  *size_Mat];
+      D.f[dirTN  ] = &DD[dirBS  *size_Mat];
+      D.f[dirTS  ] = &DD[dirBN  *size_Mat];
+      D.f[dirBN  ] = &DD[dirTS  *size_Mat];
+      D.f[dirZERO] = &DD[dirZERO*size_Mat];
+      D.f[dirTNE ] = &DD[dirBSW *size_Mat];
+      D.f[dirTSW ] = &DD[dirBNE *size_Mat];
+      D.f[dirTSE ] = &DD[dirBNW *size_Mat];
+      D.f[dirTNW ] = &DD[dirBSE *size_Mat];
+      D.f[dirBNE ] = &DD[dirTSW *size_Mat];
+      D.f[dirBSW ] = &DD[dirTNE *size_Mat];
+      D.f[dirBSE ] = &DD[dirTNW *size_Mat];
+      D.f[dirBNW ] = &DD[dirTSE *size_Mat];
+   }
+   ////////////////////////////////////////////////////////////////////////////////
+   const unsigned  x = threadIdx.x;  // Globaler x-Index 
+   const unsigned  y = blockIdx.x;   // Globaler y-Index 
+   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+
+   const unsigned nx = blockDim.x;
+   const unsigned ny = gridDim.x;
+
+   const unsigned k = nx*(ny*z + y) + x;
+   //////////////////////////////////////////////////////////////////////////
+
+   if(k<sizeQ)
+   {
+      ////////////////////////////////////////////////////////////////////////////////
+      real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB, 
+            *q_dirNE,  *q_dirSW,  *q_dirSE,  *q_dirNW,  *q_dirTE,  *q_dirBW,
+            *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
+            *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
+            *q_dirBSE, *q_dirBNW; 
+      q_dirE   = &QQ[dirE   *sizeQ];
+      q_dirW   = &QQ[dirW   *sizeQ];
+      q_dirN   = &QQ[dirN   *sizeQ];
+      q_dirS   = &QQ[dirS   *sizeQ];
+      q_dirT   = &QQ[dirT   *sizeQ];
+      q_dirB   = &QQ[dirB   *sizeQ];
+      q_dirNE  = &QQ[dirNE  *sizeQ];
+      q_dirSW  = &QQ[dirSW  *sizeQ];
+      q_dirSE  = &QQ[dirSE  *sizeQ];
+      q_dirNW  = &QQ[dirNW  *sizeQ];
+      q_dirTE  = &QQ[dirTE  *sizeQ];
+      q_dirBW  = &QQ[dirBW  *sizeQ];
+      q_dirBE  = &QQ[dirBE  *sizeQ];
+      q_dirTW  = &QQ[dirTW  *sizeQ];
+      q_dirTN  = &QQ[dirTN  *sizeQ];
+      q_dirBS  = &QQ[dirBS  *sizeQ];
+      q_dirBN  = &QQ[dirBN  *sizeQ];
+      q_dirTS  = &QQ[dirTS  *sizeQ];
+      q_dirTNE = &QQ[dirTNE *sizeQ];
+      q_dirTSW = &QQ[dirTSW *sizeQ];
+      q_dirTSE = &QQ[dirTSE *sizeQ];
+      q_dirTNW = &QQ[dirTNW *sizeQ];
+      q_dirBNE = &QQ[dirBNE *sizeQ];
+      q_dirBSW = &QQ[dirBSW *sizeQ];
+      q_dirBSE = &QQ[dirBSE *sizeQ];
+      q_dirBNW = &QQ[dirBNW *sizeQ];
+      ////////////////////////////////////////////////////////////////////////////////
+      //index
+      unsigned int KQK  = k_Q[k];
+      unsigned int kzero= KQK;
+      unsigned int ke   = KQK;
+      unsigned int kw   = neighborX[KQK];
+      unsigned int kn   = KQK;
+      unsigned int ks   = neighborY[KQK];
+      unsigned int kt   = KQK;
+      unsigned int kb   = neighborZ[KQK];
+      unsigned int ksw  = neighborY[kw];
+      unsigned int kne  = KQK;
+      unsigned int kse  = ks;
+      unsigned int knw  = kw;
+      unsigned int kbw  = neighborZ[kw];
+      unsigned int kte  = KQK;
+      unsigned int kbe  = kb;
+      unsigned int ktw  = kw;
+      unsigned int kbs  = neighborZ[ks];
+      unsigned int ktn  = KQK;
+      unsigned int kbn  = kb;
+      unsigned int kts  = ks;
+      unsigned int ktse = ks;
+      unsigned int kbnw = kbw;
+      unsigned int ktnw = kw;
+      unsigned int kbse = kbs;
+      unsigned int ktsw = ksw;
+      unsigned int kbne = kb;
+      unsigned int ktne = KQK;
+      unsigned int kbsw = neighborZ[ksw];
+      
+      ////////////////////////////////////////////////////////////////////////////////
+      real f_W    = (D.f[dirE   ])[ke   ];
+      real f_E    = (D.f[dirW   ])[kw   ];
+      real f_S    = (D.f[dirN   ])[kn   ];
+      real f_N    = (D.f[dirS   ])[ks   ];
+      real f_B    = (D.f[dirT   ])[kt   ];
+      real f_T    = (D.f[dirB   ])[kb   ];
+      real f_SW   = (D.f[dirNE  ])[kne  ];
+      real f_NE   = (D.f[dirSW  ])[ksw  ];
+      real f_NW   = (D.f[dirSE  ])[kse  ];
+      real f_SE   = (D.f[dirNW  ])[knw  ];
+      real f_BW   = (D.f[dirTE  ])[kte  ];
+      real f_TE   = (D.f[dirBW  ])[kbw  ];
+      real f_TW   = (D.f[dirBE  ])[kbe  ];
+      real f_BE   = (D.f[dirTW  ])[ktw  ];
+      real f_BS   = (D.f[dirTN  ])[ktn  ];
+      real f_TN   = (D.f[dirBS  ])[kbs  ];
+      real f_TS   = (D.f[dirBN  ])[kbn  ];
+      real f_BN   = (D.f[dirTS  ])[kts  ];
+      real f_BSW  = (D.f[dirTNE ])[ktne ];
+      real f_BNE  = (D.f[dirTSW ])[ktsw ];
+      real f_BNW  = (D.f[dirTSE ])[ktse ];
+      real f_BSE  = (D.f[dirTNW ])[ktnw ];
+      real f_TSW  = (D.f[dirBNE ])[kbne ];
+      real f_TNE  = (D.f[dirBSW ])[kbsw ];
+      real f_TNW  = (D.f[dirBSE ])[kbse ];
+      real f_TSE  = (D.f[dirBNW ])[kbnw ];
+      ////////////////////////////////////////////////////////////////////////////////
+      real vx1, vx2, vx3, drho, feq, q;
+      drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
+                f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW + 
+                f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[dirZERO])[kzero]); 
+
+      vx1    =  (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+                ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
+                (f_E - f_W)) / (c1o1 + drho); 
+         
+
+      vx2    =   ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+                 ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
+                 (f_N - f_S)) / (c1o1 + drho); 
+
+      vx3    =   (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
+                 (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
+                 (f_T - f_B)) / (c1o1 + drho); 
+
+      real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3) * (c1o1 + drho);
+
+      //////////////////////////////////////////////////////////////////////////
+      if (evenOrOdd==false)
+      {
+         D.f[dirE   ] = &DD[dirE   *size_Mat];
+         D.f[dirW   ] = &DD[dirW   *size_Mat];
+         D.f[dirN   ] = &DD[dirN   *size_Mat];
+         D.f[dirS   ] = &DD[dirS   *size_Mat];
+         D.f[dirT   ] = &DD[dirT   *size_Mat];
+         D.f[dirB   ] = &DD[dirB   *size_Mat];
+         D.f[dirNE  ] = &DD[dirNE  *size_Mat];
+         D.f[dirSW  ] = &DD[dirSW  *size_Mat];
+         D.f[dirSE  ] = &DD[dirSE  *size_Mat];
+         D.f[dirNW  ] = &DD[dirNW  *size_Mat];
+         D.f[dirTE  ] = &DD[dirTE  *size_Mat];
+         D.f[dirBW  ] = &DD[dirBW  *size_Mat];
+         D.f[dirBE  ] = &DD[dirBE  *size_Mat];
+         D.f[dirTW  ] = &DD[dirTW  *size_Mat];
+         D.f[dirTN  ] = &DD[dirTN  *size_Mat];
+         D.f[dirBS  ] = &DD[dirBS  *size_Mat];
+         D.f[dirBN  ] = &DD[dirBN  *size_Mat];
+         D.f[dirTS  ] = &DD[dirTS  *size_Mat];
+         D.f[dirZERO] = &DD[dirZERO*size_Mat];
+         D.f[dirTNE ] = &DD[dirTNE *size_Mat];
+         D.f[dirTSW ] = &DD[dirTSW *size_Mat];
+         D.f[dirTSE ] = &DD[dirTSE *size_Mat];
+         D.f[dirTNW ] = &DD[dirTNW *size_Mat];
+         D.f[dirBNE ] = &DD[dirBNE *size_Mat];
+         D.f[dirBSW ] = &DD[dirBSW *size_Mat];
+         D.f[dirBSE ] = &DD[dirBSE *size_Mat];
+         D.f[dirBNW ] = &DD[dirBNW *size_Mat];
+      } 
+      else
+      {
+         D.f[dirW   ] = &DD[dirE   *size_Mat];
+         D.f[dirE   ] = &DD[dirW   *size_Mat];
+         D.f[dirS   ] = &DD[dirN   *size_Mat];
+         D.f[dirN   ] = &DD[dirS   *size_Mat];
+         D.f[dirB   ] = &DD[dirT   *size_Mat];
+         D.f[dirT   ] = &DD[dirB   *size_Mat];
+         D.f[dirSW  ] = &DD[dirNE  *size_Mat];
+         D.f[dirNE  ] = &DD[dirSW  *size_Mat];
+         D.f[dirNW  ] = &DD[dirSE  *size_Mat];
+         D.f[dirSE  ] = &DD[dirNW  *size_Mat];
+         D.f[dirBW  ] = &DD[dirTE  *size_Mat];
+         D.f[dirTE  ] = &DD[dirBW  *size_Mat];
+         D.f[dirTW  ] = &DD[dirBE  *size_Mat];
+         D.f[dirBE  ] = &DD[dirTW  *size_Mat];
+         D.f[dirBS  ] = &DD[dirTN  *size_Mat];
+         D.f[dirTN  ] = &DD[dirBS  *size_Mat];
+         D.f[dirTS  ] = &DD[dirBN  *size_Mat];
+         D.f[dirBN  ] = &DD[dirTS  *size_Mat];
+         D.f[dirZERO] = &DD[dirZERO*size_Mat];
+         D.f[dirTNE ] = &DD[dirBSW *size_Mat];
+         D.f[dirTSW ] = &DD[dirBNE *size_Mat];
+         D.f[dirTSE ] = &DD[dirBNW *size_Mat];
+         D.f[dirTNW ] = &DD[dirBSE *size_Mat];
+         D.f[dirBNE ] = &DD[dirTSW *size_Mat];
+         D.f[dirBSW ] = &DD[dirTNE *size_Mat];
+         D.f[dirBSE ] = &DD[dirTNW *size_Mat];
+         D.f[dirBNW ] = &DD[dirTSE *size_Mat];
+      }
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      //Test
+      //(D.f[dirZERO])[k]=c1o10;
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+	  real om_turb = om1 / (c1o1 + c3o1*om1*max(c0o1, turbViscosity[k_Q[k]]));
+     
+     real fac = c1o1;//c99o100;
+	  real VeloX = fac*vx1;
+	  real VeloY = fac*vx2;
+	  real VeloZ = fac*vx3;
+	  bool x = false;
+	  bool y = false;
+	  bool z = false;
+
+      q = q_dirE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = c0o1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 x = true;
+         feq=c2o27* (drho/*+three*( vx1        )*/+c9o2*( vx1        )*( vx1        ) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirW])[kw]=(c1o1-q)/(c1o1+q)*(f_E-f_W+(f_E+f_W-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_E+f_W)-c6o1*c2o27*( VeloX     ))/(c1o1+q) - c2o27 * drho;
+         //feq=c2over27* (drho+three*( vx1        )+c9over2*( vx1        )*( vx1        )-cu_sq); 
+         //(D.f[dirW])[kw]=(one-q)/(one+q)*(f_E-feq*om1)/(one-om1)+(q*(f_E+f_W)-six*c2over27*( VeloX     ))/(one+q);
+         //(D.f[dirW])[kw]=zero;
+      }
+
+      q = q_dirW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = c0o1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 x = true;
+         feq=c2o27* (drho/*+three*(-vx1        )*/+c9o2*(-vx1        )*(-vx1        ) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirE])[ke]=(c1o1-q)/(c1o1+q)*(f_W-f_E+(f_W+f_E-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_W+f_E)-c6o1*c2o27*(-VeloX     ))/(c1o1+q) - c2o27 * drho;
+         //feq=c2over27* (drho+three*(-vx1        )+c9over2*(-vx1        )*(-vx1        )-cu_sq); 
+         //(D.f[dirE])[ke]=(one-q)/(one+q)*(f_W-feq*om_turb)/(one-om_turb)+(q*(f_W+f_E)-six*c2over27*(-VeloX     ))/(one+q);
+         //(D.f[dirE])[ke]=zero;
+      }
+
+      q = q_dirN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+		 VeloY = c0o1;
+	     VeloZ = fac*vx3;
+		 y = true;
+         feq=c2o27* (drho/*+three*(    vx2     )*/+c9o2*(     vx2    )*(     vx2    ) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirS])[ks]=(c1o1-q)/(c1o1+q)*(f_N-f_S+(f_N+f_S-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_N+f_S)-c6o1*c2o27*( VeloY     ))/(c1o1+q) - c2o27 * drho;
+         //feq=c2over27* (drho+three*(    vx2     )+c9over2*(     vx2    )*(     vx2    )-cu_sq); 
+         //(D.f[dirS])[ks]=(one-q)/(one+q)*(f_N-feq*om_turb)/(one-om_turb)+(q*(f_N+f_S)-six*c2over27*( VeloY     ))/(one+q);
+         //(D.f[dirS])[ks]=zero;
+      }
+
+      q = q_dirS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+		 VeloY = c0o1;
+	     VeloZ = fac*vx3;
+		 y = true;
+         feq=c2o27* (drho/*+three*(   -vx2     )*/+c9o2*(    -vx2    )*(    -vx2    ) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirN])[kn]=(c1o1-q)/(c1o1+q)*(f_S-f_N+(f_S+f_N-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_S+f_N)-c6o1*c2o27*(-VeloY     ))/(c1o1+q) - c2o27 * drho;
+         //feq=c2over27* (drho+three*(   -vx2     )+c9over2*(    -vx2    )*(    -vx2    )-cu_sq); 
+         //(D.f[dirN])[kn]=(one-q)/(one+q)*(f_S-feq*om_turb)/(one-om_turb)+(q*(f_S+f_N)-six*c2over27*(-VeloY     ))/(one+q);
+         //(D.f[dirN])[kn]=zero;
+      }
+
+      q = q_dirT[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+		 VeloZ = c0o1;
+		 z = true;
+         feq=c2o27* (drho/*+three*(         vx3)*/+c9o2*(         vx3)*(         vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirB])[kb]=(c1o1-q)/(c1o1+q)*(f_T-f_B+(f_T+f_B-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_T+f_B)-c6o1*c2o27*( VeloZ     ))/(c1o1+q) - c2o27 * drho;
+         //feq=c2over27* (drho+three*(         vx3)+c9over2*(         vx3)*(         vx3)-cu_sq); 
+         //(D.f[dirB])[kb]=(one-q)/(one+q)*(f_T-feq*om_turb)/(one-om_turb)+(q*(f_T+f_B)-six*c2over27*( VeloZ     ))/(one+q);
+         //(D.f[dirB])[kb]=one;
+      }
+
+      q = q_dirB[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+		 VeloZ = c0o1;
+		 z = true;
+         feq=c2o27* (drho/*+three*(        -vx3)*/+c9o2*(        -vx3)*(        -vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirT])[kt]=(c1o1-q)/(c1o1+q)*(f_B-f_T+(f_B+f_T-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_B+f_T)-c6o1*c2o27*(-VeloZ     ))/(c1o1+q) - c2o27 * drho;
+         //feq=c2over27* (drho+three*(        -vx3)+c9over2*(        -vx3)*(        -vx3)-cu_sq); 
+         //(D.f[dirT])[kt]=(one-q)/(one+q)*(f_B-feq*om_turb)/(one-om_turb)+(q*(f_B+f_T)-six*c2over27*(-VeloZ     ))/(one+q);
+         //(D.f[dirT])[kt]=zero;
+      }
+
+      q = q_dirNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+         feq=c1o54* (drho/*+three*( vx1+vx2    )*/+c9o2*( vx1+vx2    )*( vx1+vx2    ) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirSW])[ksw]=(c1o1-q)/(c1o1+q)*(f_NE-f_SW+(f_NE+f_SW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_NE+f_SW)-c6o1*c1o54*(VeloX+VeloY))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*( vx1+vx2    )+c9over2*( vx1+vx2    )*( vx1+vx2    )-cu_sq); 
+         //(D.f[dirSW])[ksw]=(one-q)/(one+q)*(f_NE-feq*om_turb)/(one-om_turb)+(q*(f_NE+f_SW)-six*c1over54*(VeloX+VeloY))/(one+q);
+         //(D.f[dirSW])[ksw]=zero;
+      }
+
+      q = q_dirSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+         feq=c1o54* (drho/*+three*(-vx1-vx2    )*/+c9o2*(-vx1-vx2    )*(-vx1-vx2    ) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirNE])[kne]=(c1o1-q)/(c1o1+q)*(f_SW-f_NE+(f_SW+f_NE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_SW+f_NE)-c6o1*c1o54*(-VeloX-VeloY))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*(-vx1-vx2    )+c9over2*(-vx1-vx2    )*(-vx1-vx2    )-cu_sq); 
+         //(D.f[dirNE])[kne]=(one-q)/(one+q)*(f_SW-feq*om_turb)/(one-om_turb)+(q*(f_SW+f_NE)-six*c1over54*(-VeloX-VeloY))/(one+q);
+         //(D.f[dirNE])[kne]=zero;
+      }
+
+      q = q_dirSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+         feq=c1o54* (drho/*+three*( vx1-vx2    )*/+c9o2*( vx1-vx2    )*( vx1-vx2    ) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirNW])[knw]=(c1o1-q)/(c1o1+q)*(f_SE-f_NW+(f_SE+f_NW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_SE+f_NW)-c6o1*c1o54*( VeloX-VeloY))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*( vx1-vx2    )+c9over2*( vx1-vx2    )*( vx1-vx2    )-cu_sq); 
+         //(D.f[dirNW])[knw]=(one-q)/(one+q)*(f_SE-feq*om_turb)/(one-om_turb)+(q*(f_SE+f_NW)-six*c1over54*( VeloX-VeloY))/(one+q);
+         //(D.f[dirNW])[knw]=zero;
+      }
+
+      q = q_dirNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+         feq=c1o54* (drho/*+three*(-vx1+vx2    )*/+c9o2*(-vx1+vx2    )*(-vx1+vx2    ) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirSE])[kse]=(c1o1-q)/(c1o1+q)*(f_NW-f_SE+(f_NW+f_SE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_NW+f_SE)-c6o1*c1o54*(-VeloX+VeloY))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*(-vx1+vx2    )+c9over2*(-vx1+vx2    )*(-vx1+vx2    )-cu_sq); 
+         //(D.f[dirSE])[kse]=(one-q)/(one+q)*(f_NW-feq*om_turb)/(one-om_turb)+(q*(f_NW+f_SE)-six*c1over54*(-VeloX+VeloY))/(one+q);
+         //(D.f[dirSE])[kse]=zero;
+      }
+
+      q = q_dirTE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (z == true) VeloZ = c0o1;
+      //  if (k==10000) printf("AFTER x: %u \t  y: %u \t z: %u \n  VeloX: %f \t VeloY: %f \t VeloZ: %f \n\n", x,y,z, VeloX,VeloY,VeloZ);
+         feq=c1o54* (drho/*+three*( vx1    +vx3)*/+c9o2*( vx1    +vx3)*( vx1    +vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirBW])[kbw]=(c1o1-q)/(c1o1+q)*(f_TE-f_BW+(f_TE+f_BW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TE+f_BW)-c6o1*c1o54*( VeloX+VeloZ))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*( vx1    +vx3)+c9over2*( vx1    +vx3)*( vx1    +vx3)-cu_sq); 
+         //(D.f[dirBW])[kbw]=(one-q)/(one+q)*(f_TE-feq*om_turb)/(one-om_turb)+(q*(f_TE+f_BW)-six*c1over54*( VeloX+VeloZ))/(one+q);
+         //(D.f[dirBW])[kbw]=zero;
+      }
+
+      q = q_dirBW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o54* (drho/*+three*(-vx1    -vx3)*/+c9o2*(-vx1    -vx3)*(-vx1    -vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirTE])[kte]=(c1o1-q)/(c1o1+q)*(f_BW-f_TE+(f_BW+f_TE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BW+f_TE)-c6o1*c1o54*(-VeloX-VeloZ))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*(-vx1    -vx3)+c9over2*(-vx1    -vx3)*(-vx1    -vx3)-cu_sq); 
+         //(D.f[dirTE])[kte]=(one-q)/(one+q)*(f_BW-feq*om_turb)/(one-om_turb)+(q*(f_BW+f_TE)-six*c1over54*(-VeloX-VeloZ))/(one+q);
+         //(D.f[dirTE])[kte]=zero;
+      }
+
+      q = q_dirBE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o54* (drho/*+three*( vx1    -vx3)*/+c9o2*( vx1    -vx3)*( vx1    -vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirTW])[ktw]=(c1o1-q)/(c1o1+q)*(f_BE-f_TW+(f_BE+f_TW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BE+f_TW)-c6o1*c1o54*( VeloX-VeloZ))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*( vx1    -vx3)+c9over2*( vx1    -vx3)*( vx1    -vx3)-cu_sq); 
+         //(D.f[dirTW])[ktw]=(one-q)/(one+q)*(f_BE-feq*om_turb)/(one-om_turb)+(q*(f_BE+f_TW)-six*c1over54*( VeloX-VeloZ))/(one+q);
+         //(D.f[dirTW])[ktw]=zero;
+      }
+
+      q = q_dirTW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o54* (drho/*+three*(-vx1    +vx3)*/+c9o2*(-vx1    +vx3)*(-vx1    +vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirBE])[kbe]=(c1o1-q)/(c1o1+q)*(f_TW-f_BE+(f_TW+f_BE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TW+f_BE)-c6o1*c1o54*(-VeloX+VeloZ))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*(-vx1    +vx3)+c9over2*(-vx1    +vx3)*(-vx1    +vx3)-cu_sq); 
+         //(D.f[dirBE])[kbe]=(one-q)/(one+q)*(f_TW-feq*om_turb)/(one-om_turb)+(q*(f_TW+f_BE)-six*c1over54*(-VeloX+VeloZ))/(one+q);
+         //(D.f[dirBE])[kbe]=zero;
+      }
+
+      q = q_dirTN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o54* (drho/*+three*(     vx2+vx3)*/+c9o2*(     vx2+vx3)*(     vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirBS])[kbs]=(c1o1-q)/(c1o1+q)*(f_TN-f_BS+(f_TN+f_BS-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TN+f_BS)-c6o1*c1o54*( VeloY+VeloZ))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*(     vx2+vx3)+c9over2*(     vx2+vx3)*(     vx2+vx3)-cu_sq); 
+         //(D.f[dirBS])[kbs]=(one-q)/(one+q)*(f_TN-feq*om_turb)/(one-om_turb)+(q*(f_TN+f_BS)-six*c1over54*( VeloY+VeloZ))/(one+q);
+         //(D.f[dirBS])[kbs]=zero;
+      }
+
+      q = q_dirBS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o54* (drho/*+three*(    -vx2-vx3)*/+c9o2*(    -vx2-vx3)*(    -vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirTN])[ktn]=(c1o1-q)/(c1o1+q)*(f_BS-f_TN+(f_BS+f_TN-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BS+f_TN)-c6o1*c1o54*( -VeloY-VeloZ))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*(    -vx2-vx3)+c9over2*(    -vx2-vx3)*(    -vx2-vx3)-cu_sq); 
+         //(D.f[dirTN])[ktn]=(one-q)/(one+q)*(f_BS-feq*om_turb)/(one-om_turb)+(q*(f_BS+f_TN)-six*c1over54*( -VeloY-VeloZ))/(one+q);
+         //(D.f[dirTN])[ktn]=zero;
+      }
+
+      q = q_dirBN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o54* (drho/*+three*(     vx2-vx3)*/+c9o2*(     vx2-vx3)*(     vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirTS])[kts]=(c1o1-q)/(c1o1+q)*(f_BN-f_TS+(f_BN+f_TS-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BN+f_TS)-c6o1*c1o54*( VeloY-VeloZ))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*(     vx2-vx3)+c9over2*(     vx2-vx3)*(     vx2-vx3)-cu_sq); 
+         //(D.f[dirTS])[kts]=(one-q)/(one+q)*(f_BN-feq*om_turb)/(one-om_turb)+(q*(f_BN+f_TS)-six*c1over54*( VeloY-VeloZ))/(one+q);
+         //(D.f[dirTS])[kts]=zero;
+      }
+
+      q = q_dirTS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o54* (drho/*+three*(    -vx2+vx3)*/+c9o2*(    -vx2+vx3)*(    -vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirBN])[kbn]=(c1o1-q)/(c1o1+q)*(f_TS-f_BN+(f_TS+f_BN-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TS+f_BN)-c6o1*c1o54*( -VeloY+VeloZ))/(c1o1+q) - c1o54 * drho;
+         //feq=c1over54* (drho+three*(    -vx2+vx3)+c9over2*(    -vx2+vx3)*(    -vx2+vx3)-cu_sq); 
+         //(D.f[dirBN])[kbn]=(one-q)/(one+q)*(f_TS-feq*om_turb)/(one-om_turb)+(q*(f_TS+f_BN)-six*c1over54*( -VeloY+VeloZ))/(one+q);
+         //(D.f[dirBN])[kbn]=zero;
+      }
+
+      q = q_dirTNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o216*(drho/*+three*( vx1+vx2+vx3)*/+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirBSW])[kbsw]=(c1o1-q)/(c1o1+q)*(f_TNE-f_BSW+(f_TNE+f_BSW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TNE+f_BSW)-c6o1*c1o216*( VeloX+VeloY+VeloZ))/(c1o1+q) - c1o216 * drho;
+         //feq=c1over216*(drho+three*( vx1+vx2+vx3)+c9over2*( vx1+vx2+vx3)*( vx1+vx2+vx3)-cu_sq); 
+         //(D.f[dirBSW])[kbsw]=(one-q)/(one+q)*(f_TNE-feq*om_turb)/(one-om_turb)+(q*(f_TNE+f_BSW)-six*c1over216*( VeloX+VeloY+VeloZ))/(one+q);
+         //(D.f[dirBSW])[kbsw]=zero;
+      }
+
+      q = q_dirBSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o216*(drho/*+three*(-vx1-vx2-vx3)*/+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirTNE])[ktne]=(c1o1-q)/(c1o1+q)*(f_BSW-f_TNE+(f_BSW+f_TNE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BSW+f_TNE)-c6o1*c1o216*(-VeloX-VeloY-VeloZ))/(c1o1+q) - c1o216 * drho;
+         //feq=c1over216*(drho+three*(-vx1-vx2-vx3)+c9over2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3)-cu_sq); 
+         //(D.f[dirTNE])[ktne]=(one-q)/(one+q)*(f_BSW-feq*om_turb)/(one-om_turb)+(q*(f_BSW+f_TNE)-six*c1over216*(-VeloX-VeloY-VeloZ))/(one+q);
+         //(D.f[dirTNE])[ktne]=zero;
+      }
+
+      q = q_dirBNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o216*(drho/*+three*( vx1+vx2-vx3)*/+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirTSW])[ktsw]=(c1o1-q)/(c1o1+q)*(f_BNE-f_TSW+(f_BNE+f_TSW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BNE+f_TSW)-c6o1*c1o216*( VeloX+VeloY-VeloZ))/(c1o1+q) - c1o216 * drho;
+         //feq=c1over216*(drho+three*( vx1+vx2-vx3)+c9over2*( vx1+vx2-vx3)*( vx1+vx2-vx3)-cu_sq); 
+         //(D.f[dirTSW])[ktsw]=(one-q)/(one+q)*(f_BNE-feq*om_turb)/(one-om_turb)+(q*(f_BNE+f_TSW)-six*c1over216*( VeloX+VeloY-VeloZ))/(one+q);
+         //(D.f[dirTSW])[ktsw]=zero;
+      }
+
+      q = q_dirTSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o216*(drho/*+three*(-vx1-vx2+vx3)*/+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirBNE])[kbne]=(c1o1-q)/(c1o1+q)*(f_TSW-f_BNE+(f_TSW+f_BNE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TSW+f_BNE)-c6o1*c1o216*(-VeloX-VeloY+VeloZ))/(c1o1+q) - c1o216 * drho;
+         //feq=c1over216*(drho+three*(-vx1-vx2+vx3)+c9over2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3)-cu_sq); 
+         //(D.f[dirBNE])[kbne]=(one-q)/(one+q)*(f_TSW-feq*om_turb)/(one-om_turb)+(q*(f_TSW+f_BNE)-six*c1over216*(-VeloX-VeloY+VeloZ))/(one+q);
+         //(D.f[dirBNE])[kbne]=zero;
+      }
+
+      q = q_dirTSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o216*(drho/*+three*( vx1-vx2+vx3)*/+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirBNW])[kbnw]=(c1o1-q)/(c1o1+q)*(f_TSE-f_BNW+(f_TSE+f_BNW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TSE+f_BNW)-c6o1*c1o216*( VeloX-VeloY+VeloZ))/(c1o1+q) - c1o216 * drho;
+         //feq=c1over216*(drho+three*( vx1-vx2+vx3)+c9over2*( vx1-vx2+vx3)*( vx1-vx2+vx3)-cu_sq); 
+         //(D.f[dirBNW])[kbnw]=(one-q)/(one+q)*(f_TSE-feq*om_turb)/(one-om_turb)+(q*(f_TSE+f_BNW)-six*c1over216*( VeloX-VeloY+VeloZ))/(one+q);
+         //(D.f[dirBNW])[kbnw]=zero;
+      }
+
+      q = q_dirBNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o216*(drho/*+three*(-vx1+vx2-vx3)*/+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirTSE])[ktse]=(c1o1-q)/(c1o1+q)*(f_BNW-f_TSE+(f_BNW+f_TSE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BNW+f_TSE)-c6o1*c1o216*(-VeloX+VeloY-VeloZ))/(c1o1+q) - c1o216 * drho;
+         //feq=c1over216*(drho+three*(-vx1+vx2-vx3)+c9over2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3)-cu_sq); 
+         //(D.f[dirTSE])[ktse]=(one-q)/(one+q)*(f_BNW-feq*om_turb)/(one-om_turb)+(q*(f_BNW+f_TSE)-six*c1over216*(-VeloX+VeloY-VeloZ))/(one+q);
+         //(D.f[dirTSE])[ktse]=zero;
+      }
+
+      q = q_dirBSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o216*(drho/*+three*( vx1-vx2-vx3)*/+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirTNW])[ktnw]=(c1o1-q)/(c1o1+q)*(f_BSE-f_TNW+(f_BSE+f_TNW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BSE+f_TNW)-c6o1*c1o216*( VeloX-VeloY-VeloZ))/(c1o1+q) - c1o216 * drho;
+         //feq=c1over216*(drho+three*( vx1-vx2-vx3)+c9over2*( vx1-vx2-vx3)*( vx1-vx2-vx3)-cu_sq); 
+         //(D.f[dirTNW])[ktnw]=(one-q)/(one+q)*(f_BSE-feq*om_turb)/(one-om_turb)+(q*(f_BSE+f_TNW)-six*c1over216*( VeloX-VeloY-VeloZ))/(one+q);
+         //(D.f[dirTNW])[ktnw]=zero;
+      }
 
+      q = q_dirTNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+		 VeloX = fac*vx1;
+	     VeloY = fac*vx2;
+	     VeloZ = fac*vx3;
+		 if (x == true) VeloX = c0o1;
+		 if (y == true) VeloY = c0o1;
+		 if (z == true) VeloZ = c0o1;
+         feq=c1o216*(drho/*+three*(-vx1+vx2+vx3)*/+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         (D.f[dirBSE])[kbse]=(c1o1-q)/(c1o1+q)*(f_TNW-f_BSE+(f_TNW+f_BSE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TNW+f_BSE)-c6o1*c1o216*(-VeloX+VeloY+VeloZ))/(c1o1+q) - c1o216 * drho;
+         //feq=c1over216*(drho+three*(-vx1+vx2+vx3)+c9over2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3)-cu_sq); 
+         //(D.f[dirBSE])[kbse]=(one-q)/(one+q)*(f_TNW-feq*om_turb)/(one-om_turb)+(q*(f_TNW+f_BSE)-six*c1over216*(-VeloX+VeloY+VeloZ))/(one+q);
+         //(D.f[dirBSE])[kbse]=zero;
+      }
+   }
+}
 
 
 
@@ -3109,48 +3777,3 @@ extern "C" __global__ void QSlipNormDeviceComp27(real* DD,
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-
diff --git a/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu
new file mode 100644
index 0000000000000000000000000000000000000000..99efb964dc1bdb3b64ce799c9c9e9f2c2abcf866
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu
@@ -0,0 +1,1661 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file StressBCs27.cu
+//! \author Henrik Asmuth
+//! \date 16/05/2022
+//! \brief Kernels for StressBC using the iMEM approach
+//!
+//! Both kernels prescribe a wall shear stress using the iMEM approach (see Asmuth et al. (2021), https://doi.org/10.1063/5.0065701).
+//! QStressDeviceComp27 couples the iMEM to the single-node interpolated bounce-back.
+//! BBStressDevice27 couples the iMEM to a simple bounce-back.
+//! Note that the iMEM function is currently only implemented for straight walls with z-normal and q=0.5.
+//! Other wall models could be implemented in the iMEM by replacing the formulations from Monin-Obukhov similarity theory (MOST)
+//! with other formulations, e.g., for smooth walls.
+//! The iMEM has so far been tested most extensively with BBStressDevice27, but QStressDeviceComp27 also seems to be stable and working.
+//=======================================================================================
+
+#include "LBM/LB.h" 
+#include "LBM/D3Q27.h"
+#include <lbm/constants/NumericConstants.h>
+
+using namespace vf::lbm::constant;
+
+//////////////////////////////////////////////////////////////////////////////
+extern "C" __host__ __device__ __forceinline__ void iMEM(uint k, uint kN,                                      //!< k: index into the per-entry arrays (normals, filtered velocities, samplingOffset, z0, u_star_monitor); kN: index at which vx/vy/vz are sampled as exchange-location velocity
+                                                         real* _wallNormalX, real* _wallNormalY, real* _wallNormalZ, //!< wall-normal components, read at [k]; used in the projections below without normalisation (presumably unit length - confirm)
+                                                         real* vx, real* vy, real* vz,                         //!< velocity arrays, read at [kN] only
+                                                         real* vx_el,      real* vy_el,      real* vz_el,      //!< mean (temporally filtered) velocities at exchange location; read and overwritten at [k]
+                                                         real* vx_w_mean,  real* vy_w_mean,  real* vz_w_mean,  //!< mean (temporally filtered) velocities at wall-adjacent node; read and overwritten at [k]
+                                                         real  vx_w_inst,  real  vy_w_inst,  real  vz_w_inst,  //!< instantaneous velocities at wall-adjacent node
+                                                         real  rho,                                            //!< divides the tangential part of wallMomentum when forming F_x/F_y/F_z
+                                                         int* samplingOffset,                                  //!< read at [k]; sampling height is computed as samplingOffset[k] + 0.5
+                                                         real q,                                               //!< not read anywhere in this function (the height z assumes q=0.5)
+                                                         real forceFactor,                                     //!< scales the resulting wall velocity; e.g., 1.0 for simple bounce-back, or (1+q) for interpolated single-node bounce-back as in Geier et al (2015)
+                                                         real eps,                                             //!< filter constant in temporal averaging (weight of the newest sample)
+                                                         real* z0,                                             //!< aerodynamic roughness length, read at [k]
+                                                         bool  hasWallModelMonitor,                            //!< if true, u_star is stored in u_star_monitor[k]
+                                                         real* u_star_monitor,                                 //!< only written when hasWallModelMonitor is true
+                                                         real wallMomentumX, real wallMomentumY, real wallMomentumZ, //!< momentum vector; its wall-tangential part divided by rho is subtracted from the modelled stress
+                                                         real& wallVelocityX, real& wallVelocityY, real&wallVelocityZ) //!< outputs: clipped wall velocity components
+{ // iMEM: filters the sampled velocities in time, derives u_star from a log-law relation, and returns a clipped wall velocity through wallVelocityX/Y/Z
+      real wallNormalX = _wallNormalX[k];
+      real wallNormalY = _wallNormalY[k];
+      real wallNormalZ = _wallNormalZ[k];
+
+      //Sample velocity at exchange location and filter temporally: new = eps*sample + (1-eps)*old; the filtered value is written back to vx_el/vy_el/vz_el[k]
+      real _vx_el = eps*vx[kN]+(1.0-eps)*vx_el[k]; //NOTE(review): 1.0 (like 0.5, 0.4, 2.0, 3.0 below) is a double literal; if real is float these expressions are evaluated in double - confirm intended
+      real _vy_el = eps*vy[kN]+(1.0-eps)*vy_el[k];
+      real _vz_el = eps*vz[kN]+(1.0-eps)*vz_el[k];
+      vx_el[k] = _vx_el;
+      vy_el[k] = _vy_el;
+      vz_el[k] = _vz_el;
+
+      //Apply the same temporal filter to the velocity at the wall-adjacent node and store it in vx_w_mean/vy_w_mean/vz_w_mean[k]
+      real _vx_w_mean = eps*vx_w_inst+(1.0-eps)*vx_w_mean[k];
+      real _vy_w_mean = eps*vy_w_inst+(1.0-eps)*vy_w_mean[k];
+      real _vz_w_mean = eps*vz_w_inst+(1.0-eps)*vz_w_mean[k];
+      vx_w_mean[k] = _vx_w_mean;
+      vy_w_mean[k] = _vy_w_mean;
+      vz_w_mean[k] = _vz_w_mean;
+
+      //Subtract wall-normal velocity components so that only the wall-tangential parts remain (local copies only; the stored filtered values keep their normal part)
+      real vDotN_el = _vx_el*wallNormalX + _vy_el*wallNormalY + _vz_el*wallNormalZ;
+      _vx_el -= vDotN_el*wallNormalX;
+      _vy_el -= vDotN_el*wallNormalY;
+      _vz_el -= vDotN_el*wallNormalZ;
+      real vMag_el = sqrt( _vx_el*_vx_el + _vy_el*_vy_el + _vz_el*_vz_el ); //magnitude of the tangential filtered exchange-location velocity
+
+      real vDotN_w_mean = _vx_w_mean*wallNormalX + _vy_w_mean*wallNormalY + _vz_w_mean*wallNormalZ;
+      _vx_w_mean -= vDotN_w_mean*wallNormalX;
+      _vy_w_mean -= vDotN_w_mean*wallNormalY;
+      _vz_w_mean -= vDotN_w_mean*wallNormalZ;
+      real vMag_w_mean = sqrt( _vx_w_mean*_vx_w_mean + _vy_w_mean*_vy_w_mean + _vz_w_mean*_vz_w_mean ); //magnitude of the tangential filtered wall-adjacent velocity
+
+      real vDotN_w = vx_w_inst*wallNormalX + vy_w_inst*wallNormalY + vz_w_inst*wallNormalZ; //tangential part of the instantaneous wall-adjacent velocity
+      real _vx_w = vx_w_inst-vDotN_w*wallNormalX;
+      real _vy_w = vy_w_inst-vDotN_w*wallNormalY;
+      real _vz_w = vz_w_inst-vDotN_w*wallNormalZ;
+            
+      //Compute wall shear stress tau_w via MOST: u_star = kappa*vMag_el/log(z/z0), tau_w = u_star^2
+      real z = (real)samplingOffset[k] + 0.5; //assuming q=0.5, could be replaced by wall distance via wall normal
+      real kappa = 0.4; //von Karman constant of the log law used in the next line
+      real u_star = vMag_el*kappa/(log(z/z0[k])); //NOTE(review): no guard for z0[k] <= 0 or z0[k] >= z (log undefined or <= 0) - presumably excluded by the setup; confirm
+      if(hasWallModelMonitor) u_star_monitor[k] = u_star;
+      real tau_w = u_star*u_star;                  //Note: this is actually tau_w/rho
+      real A = 1.0;                                //wall area (obviously 1 for grid aligned walls, can come from grid builder later for complex geometries)
+      
+      //Scale wall shear stress with near wall velocity, i.e., Schumann-Grötzbach (SG) approach: instantaneous tangential wall-adjacent velocity divided by the magnitude of its filtered counterpart
+      real F_w_x = (tau_w*A) * (_vx_w/vMag_w_mean);//(_vx_el/vMag_el)  NOTE(review): vMag_w_mean == 0 yields a division by zero in these three lines - confirm it cannot occur or is absorbed by the clipping below
+      real F_w_y = (tau_w*A) * (_vy_w/vMag_w_mean);//(_vy_el/vMag_el)
+      real F_w_z = (tau_w*A) * (_vz_w/vMag_w_mean);//(_vz_el/vMag_el)
+      //                                                ^^^^^^^^^^^^--- old alternative: do not scale SG-like but only set direction via velocity at exchange location
+      
+      //Momentum to be applied via wall velocity: modelled stress minus the wall-tangential part of wallMomentum divided by rho
+      real wallMomDotN = wallMomentumX*wallNormalX+wallMomentumY*wallNormalY+wallMomentumZ*wallNormalZ;
+      real F_x =  F_w_x - ( wallMomentumX - wallMomDotN*wallNormalX )/rho;
+      real F_y =  F_w_y - ( wallMomentumY - wallMomDotN*wallNormalY )/rho;
+      real F_z =  F_w_z - ( wallMomentumZ - wallMomDotN*wallNormalZ )/rho;
+
+      //Compute  wall velocity and clip (clipping only necessary for initial boundary layer development); the bound per component is clipWallVelo times the tangential filtered exchange-location velocity
+      real clipWallVelo = 2.0;
+      real clipVx = clipWallVelo*_vx_el;
+      real clipVy = clipWallVelo*_vy_el;
+      real clipVz = clipWallVelo*_vz_el;
+
+      wallVelocityX = clipVx > -clipVx? min(clipVx, max(-clipVx, -3.0*F_x*forceFactor)): max(clipVx, min(-clipVx, -3.0*F_x*forceFactor)); //the ternary selects the bound ordering by the sign of clipVx, i.e. -3.0*F_x*forceFactor is limited to the interval between -clipVx and clipVx
+      wallVelocityY = clipVy > -clipVy? min(clipVy, max(-clipVy, -3.0*F_y*forceFactor)): max(clipVy, min(-clipVy, -3.0*F_y*forceFactor));
+      wallVelocityZ = clipVz > -clipVz? min(clipVz, max(-clipVz, -3.0*F_z*forceFactor)): max(clipVz, min(-clipVz, -3.0*F_z*forceFactor));
+}
+
+//////////////////////////////////////////////////////////////////////////////
+extern "C" __global__ void QStressDeviceComp27(real* DD, 
+											   int* k_Q, 
+                                    int* k_N, 
+											   real* QQ,
+                                    unsigned int sizeQ,
+                                    real om1, 
+                                    real* turbViscosity,
+                                    real* vx,
+                                    real* vy,
+                                    real* vz,
+                                    real* normalX,
+                                    real* normalY,
+                                    real* normalZ,
+                                    real* vx_el,
+                                    real* vy_el,
+                                    real* vz_el,
+                                    real* vx_w_mean,
+                                    real* vy_w_mean,
+                                    real* vz_w_mean,
+                                    int* samplingOffset,
+                                    real* z0,
+                                    bool  hasWallModelMonitor,
+                                    real* u_star_monitor,
+                                    real* Fx_monitor,
+                                    real* Fy_monitor,
+                                    real* Fz_monitor,
+											   unsigned int* neighborX,
+                                    unsigned int* neighborY,
+                                    unsigned int* neighborZ,
+                                    unsigned int size_Mat, 
+                                    bool evenOrOdd)
+{
+
+   bool printOut = false;
+
+   Distributions27 D;
+   if (evenOrOdd==true)//get right array of post coll f's
+   {
+      D.f[dirE   ] = &DD[dirE   *size_Mat];
+      D.f[dirW   ] = &DD[dirW   *size_Mat];
+      D.f[dirN   ] = &DD[dirN   *size_Mat];
+      D.f[dirS   ] = &DD[dirS   *size_Mat];
+      D.f[dirT   ] = &DD[dirT   *size_Mat];
+      D.f[dirB   ] = &DD[dirB   *size_Mat];
+      D.f[dirNE  ] = &DD[dirNE  *size_Mat];
+      D.f[dirSW  ] = &DD[dirSW  *size_Mat];
+      D.f[dirSE  ] = &DD[dirSE  *size_Mat];
+      D.f[dirNW  ] = &DD[dirNW  *size_Mat];
+      D.f[dirTE  ] = &DD[dirTE  *size_Mat];
+      D.f[dirBW  ] = &DD[dirBW  *size_Mat];
+      D.f[dirBE  ] = &DD[dirBE  *size_Mat];
+      D.f[dirTW  ] = &DD[dirTW  *size_Mat];
+      D.f[dirTN  ] = &DD[dirTN  *size_Mat];
+      D.f[dirBS  ] = &DD[dirBS  *size_Mat];
+      D.f[dirBN  ] = &DD[dirBN  *size_Mat];
+      D.f[dirTS  ] = &DD[dirTS  *size_Mat];
+      D.f[dirZERO] = &DD[dirZERO*size_Mat];
+      D.f[dirTNE ] = &DD[dirTNE *size_Mat];
+      D.f[dirTSW ] = &DD[dirTSW *size_Mat];
+      D.f[dirTSE ] = &DD[dirTSE *size_Mat];
+      D.f[dirTNW ] = &DD[dirTNW *size_Mat];
+      D.f[dirBNE ] = &DD[dirBNE *size_Mat];
+      D.f[dirBSW ] = &DD[dirBSW *size_Mat];
+      D.f[dirBSE ] = &DD[dirBSE *size_Mat];
+      D.f[dirBNW ] = &DD[dirBNW *size_Mat];
+   } 
+   else
+   {
+      D.f[dirW   ] = &DD[dirE   *size_Mat];
+      D.f[dirE   ] = &DD[dirW   *size_Mat];
+      D.f[dirS   ] = &DD[dirN   *size_Mat];
+      D.f[dirN   ] = &DD[dirS   *size_Mat];
+      D.f[dirB   ] = &DD[dirT   *size_Mat];
+      D.f[dirT   ] = &DD[dirB   *size_Mat];
+      D.f[dirSW  ] = &DD[dirNE  *size_Mat];
+      D.f[dirNE  ] = &DD[dirSW  *size_Mat];
+      D.f[dirNW  ] = &DD[dirSE  *size_Mat];
+      D.f[dirSE  ] = &DD[dirNW  *size_Mat];
+      D.f[dirBW  ] = &DD[dirTE  *size_Mat];
+      D.f[dirTE  ] = &DD[dirBW  *size_Mat];
+      D.f[dirTW  ] = &DD[dirBE  *size_Mat];
+      D.f[dirBE  ] = &DD[dirTW  *size_Mat];
+      D.f[dirBS  ] = &DD[dirTN  *size_Mat];
+      D.f[dirTN  ] = &DD[dirBS  *size_Mat];
+      D.f[dirTS  ] = &DD[dirBN  *size_Mat];
+      D.f[dirBN  ] = &DD[dirTS  *size_Mat];
+      D.f[dirZERO] = &DD[dirZERO*size_Mat];
+      D.f[dirTNE ] = &DD[dirBSW *size_Mat];
+      D.f[dirTSW ] = &DD[dirBNE *size_Mat];
+      D.f[dirTSE ] = &DD[dirBNW *size_Mat];
+      D.f[dirTNW ] = &DD[dirBSE *size_Mat];
+      D.f[dirBNE ] = &DD[dirTSW *size_Mat];
+      D.f[dirBSW ] = &DD[dirTNE *size_Mat];
+      D.f[dirBSE ] = &DD[dirTNW *size_Mat];
+      D.f[dirBNW ] = &DD[dirTSE *size_Mat];
+   }
+   ////////////////////////////////////////////////////////////////////////////////
+   const unsigned  x = threadIdx.x;  // Globaler x-Index 
+   const unsigned  y = blockIdx.x;   // Globaler y-Index 
+   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+
+   const unsigned nx = blockDim.x;
+   const unsigned ny = gridDim.x;
+
+   const unsigned k = nx*(ny*z + y) + x;
+   //////////////////////////////////////////////////////////////////////////
+
+   if(k<sizeQ/*kQ*/)
+   {
+      ////////////////////////////////////////////////////////////////////////////////
+      real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB, 
+            *q_dirNE,  *q_dirSW,  *q_dirSE,  *q_dirNW,  *q_dirTE,  *q_dirBW,
+            *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
+            *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
+            *q_dirBSE, *q_dirBNW; 
+      q_dirE   = &QQ[dirE   *sizeQ];
+      q_dirW   = &QQ[dirW   *sizeQ];
+      q_dirN   = &QQ[dirN   *sizeQ];
+      q_dirS   = &QQ[dirS   *sizeQ];
+      q_dirT   = &QQ[dirT   *sizeQ];
+      q_dirB   = &QQ[dirB   *sizeQ];
+      q_dirNE  = &QQ[dirNE  *sizeQ];
+      q_dirSW  = &QQ[dirSW  *sizeQ];
+      q_dirSE  = &QQ[dirSE  *sizeQ];
+      q_dirNW  = &QQ[dirNW  *sizeQ];
+      q_dirTE  = &QQ[dirTE  *sizeQ];
+      q_dirBW  = &QQ[dirBW  *sizeQ];
+      q_dirBE  = &QQ[dirBE  *sizeQ];
+      q_dirTW  = &QQ[dirTW  *sizeQ];
+      q_dirTN  = &QQ[dirTN  *sizeQ];
+      q_dirBS  = &QQ[dirBS  *sizeQ];
+      q_dirBN  = &QQ[dirBN  *sizeQ];
+      q_dirTS  = &QQ[dirTS  *sizeQ];
+      q_dirTNE = &QQ[dirTNE *sizeQ];
+      q_dirTSW = &QQ[dirTSW *sizeQ];
+      q_dirTSE = &QQ[dirTSE *sizeQ];
+      q_dirTNW = &QQ[dirTNW *sizeQ];
+      q_dirBNE = &QQ[dirBNE *sizeQ];
+      q_dirBSW = &QQ[dirBSW *sizeQ];
+      q_dirBSE = &QQ[dirBSE *sizeQ];
+      q_dirBNW = &QQ[dirBNW *sizeQ];
+      ////////////////////////////////////////////////////////////////////////////////
+      //index
+      unsigned int KQK  = k_Q[k];
+      unsigned int kzero= KQK;      //get right adress of post-coll f's
+      unsigned int ke   = KQK;
+      unsigned int kw   = neighborX[KQK];
+      unsigned int kn   = KQK;
+      unsigned int ks   = neighborY[KQK];
+      unsigned int kt   = KQK;
+      unsigned int kb   = neighborZ[KQK];
+      unsigned int ksw  = neighborY[kw];
+      unsigned int kne  = KQK;
+      unsigned int kse  = ks;
+      unsigned int knw  = kw;
+      unsigned int kbw  = neighborZ[kw];
+      unsigned int kte  = KQK;
+      unsigned int kbe  = kb;
+      unsigned int ktw  = kw;
+      unsigned int kbs  = neighborZ[ks];
+      unsigned int ktn  = KQK;
+      unsigned int kbn  = kb;
+      unsigned int kts  = ks;
+      unsigned int ktse = ks;
+      unsigned int kbnw = kbw;
+      unsigned int ktnw = kw;
+      unsigned int kbse = kbs;
+      unsigned int ktsw = ksw;
+      unsigned int kbne = kb;
+      unsigned int ktne = KQK;
+      unsigned int kbsw = neighborZ[ksw];
+      ////////////////////////////////////////////////////////////////////////////////
+      real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
+         f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
+
+      f_W    = (D.f[dirE   ])[ke   ];     //post-coll f's
+      f_E    = (D.f[dirW   ])[kw   ];
+      f_S    = (D.f[dirN   ])[kn   ];
+      f_N    = (D.f[dirS   ])[ks   ];
+      f_B    = (D.f[dirT   ])[kt   ];
+      f_T    = (D.f[dirB   ])[kb   ];
+      f_SW   = (D.f[dirNE  ])[kne  ];
+      f_NE   = (D.f[dirSW  ])[ksw  ];
+      f_NW   = (D.f[dirSE  ])[kse  ];
+      f_SE   = (D.f[dirNW  ])[knw  ];
+      f_BW   = (D.f[dirTE  ])[kte  ];
+      f_TE   = (D.f[dirBW  ])[kbw  ];
+      f_TW   = (D.f[dirBE  ])[kbe  ];
+      f_BE   = (D.f[dirTW  ])[ktw  ];
+      f_BS   = (D.f[dirTN  ])[ktn  ];
+      f_TN   = (D.f[dirBS  ])[kbs  ];
+      f_TS   = (D.f[dirBN  ])[kbn  ];
+      f_BN   = (D.f[dirTS  ])[kts  ];
+      f_BSW  = (D.f[dirTNE ])[ktne ];
+      f_BNE  = (D.f[dirTSW ])[ktsw ];
+      f_BNW  = (D.f[dirTSE ])[ktse ];
+      f_BSE  = (D.f[dirTNW ])[ktnw ];
+      f_TSW  = (D.f[dirBNE ])[kbne ];
+      f_TNE  = (D.f[dirBSW ])[kbsw ];
+      f_TNW  = (D.f[dirBSE ])[kbse ];
+      f_TSE  = (D.f[dirBNW ])[kbnw ];
+
+      ////////////////////////////////////////////////////////////////////////////////
+      real vx1, vx2, vx3, drho, feq, q;
+      drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
+                f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW + 
+                f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[dirZERO])[kzero]); 
+
+      vx1    =  (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+                ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
+                (f_E - f_W)) / (c1o1 + drho); 
+         
+
+      vx2    =   ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+                 ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
+                 (f_N - f_S)) / (c1o1 + drho); 
+
+      vx3    =   (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
+                 (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
+                 (f_T - f_B)) / (c1o1 + drho); 
+
+      real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3) * (c1o1 + drho);
+      
+      real om_turb = om1 / (c1o1 + c3o1*om1*max(c0o1, turbViscosity[k_Q[k]]));
+      //////////////////////////////////////////////////////////////////////////
+      if (evenOrOdd==false)      //get adress where incoming f's should be written to
+      {
+         D.f[dirE   ] = &DD[dirE   *size_Mat];
+         D.f[dirW   ] = &DD[dirW   *size_Mat];
+         D.f[dirN   ] = &DD[dirN   *size_Mat];
+         D.f[dirS   ] = &DD[dirS   *size_Mat];
+         D.f[dirT   ] = &DD[dirT   *size_Mat];
+         D.f[dirB   ] = &DD[dirB   *size_Mat];
+         D.f[dirNE  ] = &DD[dirNE  *size_Mat];
+         D.f[dirSW  ] = &DD[dirSW  *size_Mat];
+         D.f[dirSE  ] = &DD[dirSE  *size_Mat];
+         D.f[dirNW  ] = &DD[dirNW  *size_Mat];
+         D.f[dirTE  ] = &DD[dirTE  *size_Mat];
+         D.f[dirBW  ] = &DD[dirBW  *size_Mat];
+         D.f[dirBE  ] = &DD[dirBE  *size_Mat];
+         D.f[dirTW  ] = &DD[dirTW  *size_Mat];
+         D.f[dirTN  ] = &DD[dirTN  *size_Mat];
+         D.f[dirBS  ] = &DD[dirBS  *size_Mat];
+         D.f[dirBN  ] = &DD[dirBN  *size_Mat];
+         D.f[dirTS  ] = &DD[dirTS  *size_Mat];
+         D.f[dirZERO] = &DD[dirZERO*size_Mat];
+         D.f[dirTNE ] = &DD[dirTNE *size_Mat];
+         D.f[dirTSW ] = &DD[dirTSW *size_Mat];
+         D.f[dirTSE ] = &DD[dirTSE *size_Mat];
+         D.f[dirTNW ] = &DD[dirTNW *size_Mat];
+         D.f[dirBNE ] = &DD[dirBNE *size_Mat];
+         D.f[dirBSW ] = &DD[dirBSW *size_Mat];
+         D.f[dirBSE ] = &DD[dirBSE *size_Mat];
+         D.f[dirBNW ] = &DD[dirBNW *size_Mat];
+      } 
+      else
+      {
+         D.f[dirW   ] = &DD[dirE   *size_Mat];
+         D.f[dirE   ] = &DD[dirW   *size_Mat];
+         D.f[dirS   ] = &DD[dirN   *size_Mat];
+         D.f[dirN   ] = &DD[dirS   *size_Mat];
+         D.f[dirB   ] = &DD[dirT   *size_Mat];
+         D.f[dirT   ] = &DD[dirB   *size_Mat];
+         D.f[dirSW  ] = &DD[dirNE  *size_Mat];
+         D.f[dirNE  ] = &DD[dirSW  *size_Mat];
+         D.f[dirNW  ] = &DD[dirSE  *size_Mat];
+         D.f[dirSE  ] = &DD[dirNW  *size_Mat];
+         D.f[dirBW  ] = &DD[dirTE  *size_Mat];
+         D.f[dirTE  ] = &DD[dirBW  *size_Mat];
+         D.f[dirTW  ] = &DD[dirBE  *size_Mat];
+         D.f[dirBE  ] = &DD[dirTW  *size_Mat];
+         D.f[dirBS  ] = &DD[dirTN  *size_Mat];
+         D.f[dirTN  ] = &DD[dirBS  *size_Mat];
+         D.f[dirTS  ] = &DD[dirBN  *size_Mat];
+         D.f[dirBN  ] = &DD[dirTS  *size_Mat];
+         D.f[dirZERO] = &DD[dirZERO*size_Mat];
+         D.f[dirTNE ] = &DD[dirBSW *size_Mat];
+         D.f[dirTSW ] = &DD[dirBNE *size_Mat];
+         D.f[dirTSE ] = &DD[dirBNW *size_Mat];
+         D.f[dirTNW ] = &DD[dirBSE *size_Mat];
+         D.f[dirBNE ] = &DD[dirTSW *size_Mat];
+         D.f[dirBSW ] = &DD[dirTNE *size_Mat];
+         D.f[dirBSE ] = &DD[dirTNW *size_Mat];
+         D.f[dirBNW ] = &DD[dirTSE *size_Mat];
+      }
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      //Compute incoming f's with zero wall velocity
+      ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      // incoming f's from bounce back
+      real f_E_in = 0.0,  f_W_in = 0.0,  f_N_in = 0.0,  f_S_in = 0.0,  f_T_in = 0.0,  f_B_in = 0.0,   f_NE_in = 0.0,  f_SW_in = 0.0,  f_SE_in = 0.0,  f_NW_in = 0.0,  f_TE_in = 0.0,  f_BW_in = 0.0,  f_BE_in = 0.0, f_TW_in = 0.0, f_TN_in = 0.0, f_BS_in = 0.0, f_BN_in = 0.0, f_TS_in = 0.0, f_TNE_in = 0.0, f_TSW_in = 0.0, f_TSE_in = 0.0, f_TNW_in = 0.0, f_BNE_in = 0.0, f_BSW_in = 0.0, f_BSE_in = 0.0, f_BNW_in = 0.0;
+      // momentum exchanged with wall at rest
+      real wallMomentumX = 0.0, wallMomentumY = 0.0, wallMomentumZ = 0.0;
+
+      q = q_dirE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho/*+three*( vx1        )*/+c9o2*( vx1        )*( vx1        ) * (c1o1 + drho)-cu_sq); 
+         f_W_in=(c1o1-q)/(c1o1+q)*(f_E-f_W+(f_E+f_W-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_E+f_W))/(c1o1+q) - c2o27 * drho;
+         wallMomentumX += f_E+f_W_in;
+      }
+
+      q = q_dirW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho/*+three*(-vx1        )*/+c9o2*(-vx1        )*(-vx1        ) * (c1o1 + drho)-cu_sq); 
+         f_E_in=(c1o1-q)/(c1o1+q)*(f_W-f_E+(f_W+f_E-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_W+f_E))/(c1o1+q) - c2o27 * drho;
+         wallMomentumX -= f_W+f_E_in;
+      }
+
+      q = q_dirN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho/*+three*(    vx2     )*/+c9o2*(     vx2    )*(     vx2    ) * (c1o1 + drho)-cu_sq); 
+         f_S_in=(c1o1-q)/(c1o1+q)*(f_N-f_S+(f_N+f_S-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_N+f_S))/(c1o1+q) - c2o27 * drho;
+         wallMomentumY += f_N+f_S_in;
+      }
+
+      q = q_dirS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho/*+three*(   -vx2     )*/+c9o2*(    -vx2    )*(    -vx2    ) * (c1o1 + drho)-cu_sq); 
+         f_N_in=(c1o1-q)/(c1o1+q)*(f_S-f_N+(f_S+f_N-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_S+f_N))/(c1o1+q) - c2o27 * drho;
+         wallMomentumY -= f_S+f_N_in;
+      }
+
+      q = q_dirT[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho/*+three*(         vx3)*/+c9o2*(         vx3)*(         vx3) * (c1o1 + drho)-cu_sq); 
+         f_B_in=(c1o1-q)/(c1o1+q)*(f_T-f_B+(f_T+f_B-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_T+f_B))/(c1o1+q) - c2o27 * drho;
+         wallMomentumZ += f_T+f_B_in;
+      }
+
+      q = q_dirB[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c2o27* (drho/*+three*(        -vx3)*/+c9o2*(        -vx3)*(        -vx3) * (c1o1 + drho)-cu_sq); 
+         f_T_in=(c1o1-q)/(c1o1+q)*(f_B-f_T+(f_B+f_T-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_B+f_T))/(c1o1+q) - c2o27 * drho;
+         wallMomentumZ -= f_B+f_T_in;
+      }
+
+      q = q_dirNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*( vx1+vx2    )*/+c9o2*( vx1+vx2    )*( vx1+vx2    ) * (c1o1 + drho)-cu_sq); 
+         f_SW_in=(c1o1-q)/(c1o1+q)*(f_NE-f_SW+(f_NE+f_SW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_NE+f_SW))/(c1o1+q) - c1o54 * drho;
+         wallMomentumX += f_NE+f_SW_in;
+         wallMomentumY += f_NE+f_SW_in;
+      }
+
+      q = q_dirSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*(-vx1-vx2    )*/+c9o2*(-vx1-vx2    )*(-vx1-vx2    ) * (c1o1 + drho)-cu_sq); 
+         f_NE_in=(c1o1-q)/(c1o1+q)*(f_SW-f_NE+(f_SW+f_NE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_SW+f_NE))/(c1o1+q) - c1o54 * drho;
+         wallMomentumX -= f_SW+f_NE_in;
+         wallMomentumY -= f_SW+f_NE_in;
+      }
+
+      q = q_dirSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*( vx1-vx2    )*/+c9o2*( vx1-vx2    )*( vx1-vx2    ) * (c1o1 + drho)-cu_sq); 
+         f_NW_in=(c1o1-q)/(c1o1+q)*(f_SE-f_NW+(f_SE+f_NW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_SE+f_NW))/(c1o1+q) - c1o54 * drho;
+         wallMomentumX += f_SE+f_NW_in;
+         wallMomentumY -= f_SE+f_NW_in;
+      }
+
+      q = q_dirNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*(-vx1+vx2    )*/+c9o2*(-vx1+vx2    )*(-vx1+vx2    ) * (c1o1 + drho)-cu_sq); 
+         f_SE_in=(c1o1-q)/(c1o1+q)*(f_NW-f_SE+(f_NW+f_SE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_NW+f_SE))/(c1o1+q) - c1o54 * drho;
+         wallMomentumX -= f_NW+f_SE_in;
+         wallMomentumY += f_NW+f_SE_in;
+      }
+
+      q = q_dirTE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*( vx1    +vx3)*/+c9o2*( vx1    +vx3)*( vx1    +vx3) * (c1o1 + drho)-cu_sq); 
+         f_BW_in=(c1o1-q)/(c1o1+q)*(f_TE-f_BW+(f_TE+f_BW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TE+f_BW))/(c1o1+q) - c1o54 * drho;
+         wallMomentumX += f_TE+f_BW_in;
+         wallMomentumZ += f_TE+f_BW_in;
+      }
+
+      q = q_dirBW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*(-vx1    -vx3)*/+c9o2*(-vx1    -vx3)*(-vx1    -vx3) * (c1o1 + drho)-cu_sq); 
+         f_TE_in=(c1o1-q)/(c1o1+q)*(f_BW-f_TE+(f_BW+f_TE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BW+f_TE))/(c1o1+q) - c1o54 * drho;
+         wallMomentumX -= f_BW+f_TE_in;
+         wallMomentumZ -= f_BW+f_TE_in;
+      }
+
+      q = q_dirBE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*( vx1    -vx3)*/+c9o2*( vx1    -vx3)*( vx1    -vx3) * (c1o1 + drho)-cu_sq); 
+         f_TW_in=(c1o1-q)/(c1o1+q)*(f_BE-f_TW+(f_BE+f_TW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BE+f_TW))/(c1o1+q) - c1o54 * drho;
+         wallMomentumX += f_BE+f_TW_in;
+         wallMomentumZ -= f_BE+f_TW_in;
+      }
+
+      q = q_dirTW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*(-vx1    +vx3)*/+c9o2*(-vx1    +vx3)*(-vx1    +vx3) * (c1o1 + drho)-cu_sq); 
+         f_BE_in=(c1o1-q)/(c1o1+q)*(f_TW-f_BE+(f_TW+f_BE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TW+f_BE))/(c1o1+q) - c1o54 * drho;
+         wallMomentumX -= f_TW+f_BE_in;
+         wallMomentumZ += f_TW+f_BE_in;
+      }
+
+      q = q_dirTN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*(     vx2+vx3)*/+c9o2*(     vx2+vx3)*(     vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         f_BS_in=(c1o1-q)/(c1o1+q)*(f_TN-f_BS+(f_TN+f_BS-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TN+f_BS))/(c1o1+q) - c1o54 * drho;
+         wallMomentumY += f_TN+f_BS_in;
+         wallMomentumZ += f_TN+f_BS_in;
+      }
+
+      q = q_dirBS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*(    -vx2-vx3)*/+c9o2*(    -vx2-vx3)*(    -vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         f_TN_in=(c1o1-q)/(c1o1+q)*(f_BS-f_TN+(f_BS+f_TN-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BS+f_TN))/(c1o1+q) - c1o54 * drho;
+         wallMomentumY -= f_BS+f_TN_in;
+         wallMomentumZ -= f_BS+f_TN_in;
+      }
+
+      q = q_dirBN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*(     vx2-vx3)*/+c9o2*(     vx2-vx3)*(     vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         f_TS_in=(c1o1-q)/(c1o1+q)*(f_BN-f_TS+(f_BN+f_TS-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BN+f_TS))/(c1o1+q) - c1o54 * drho;
+         wallMomentumY += f_BN+f_TS_in;
+         wallMomentumZ -= f_BN+f_TS_in;
+      }
+
+      q = q_dirTS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o54* (drho/*+three*(    -vx2+vx3)*/+c9o2*(    -vx2+vx3)*(    -vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         f_BN_in=(c1o1-q)/(c1o1+q)*(f_TS-f_BN+(f_TS+f_BN-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TS+f_BN))/(c1o1+q) - c1o54 * drho;
+         wallMomentumY -= f_TS+f_BN_in;
+         wallMomentumZ += f_TS+f_BN_in;
+      }
+
+      q = q_dirTNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho/*+three*( vx1+vx2+vx3)*/+c9o2*( vx1+vx2+vx3)*( vx1+vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         f_BSW_in=(c1o1-q)/(c1o1+q)*(f_TNE-f_BSW+(f_TNE+f_BSW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TNE+f_BSW))/(c1o1+q) - c1o216 * drho;
+         wallMomentumX += f_TNE+f_BSW_in;
+         wallMomentumY += f_TNE+f_BSW_in;
+         wallMomentumZ += f_TNE+f_BSW_in;
+      }
+
+      q = q_dirBSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho/*+three*(-vx1-vx2-vx3)*/+c9o2*(-vx1-vx2-vx3)*(-vx1-vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         f_TNE_in=(c1o1-q)/(c1o1+q)*(f_BSW-f_TNE+(f_BSW+f_TNE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BSW+f_TNE))/(c1o1+q) - c1o216 * drho;
+         wallMomentumX -= f_BSW+f_TNE_in;
+         wallMomentumY -= f_BSW+f_TNE_in;
+         wallMomentumZ -= f_BSW+f_TNE_in;
+      }
+
+      q = q_dirBNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho/*+three*( vx1+vx2-vx3)*/+c9o2*( vx1+vx2-vx3)*( vx1+vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         f_TSW_in=(c1o1-q)/(c1o1+q)*(f_BNE-f_TSW+(f_BNE+f_TSW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BNE+f_TSW))/(c1o1+q) - c1o216 * drho;
+         wallMomentumX += f_BNE+f_TSW_in;
+         wallMomentumY += f_BNE+f_TSW_in;
+         wallMomentumZ -= f_BNE+f_TSW_in;
+      }
+
+      q = q_dirTSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho/*+three*(-vx1-vx2+vx3)*/+c9o2*(-vx1-vx2+vx3)*(-vx1-vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         f_BNE_in=(c1o1-q)/(c1o1+q)*(f_TSW-f_BNE+(f_TSW+f_BNE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TSW+f_BNE))/(c1o1+q) - c1o216 * drho;
+         wallMomentumX -= f_TSW+f_BNE_in;
+         wallMomentumY -= f_TSW+f_BNE_in;
+         wallMomentumZ += f_TSW+f_BNE_in;
+      }
+
+      q = q_dirTSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho/*+three*( vx1-vx2+vx3)*/+c9o2*( vx1-vx2+vx3)*( vx1-vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         f_BNW_in=(c1o1-q)/(c1o1+q)*(f_TSE-f_BNW+(f_TSE+f_BNW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TSE+f_BNW))/(c1o1+q) - c1o216 * drho;
+         wallMomentumX += f_TSE+f_BNW_in;
+         wallMomentumY -= f_TSE+f_BNW_in;
+         wallMomentumZ += f_TSE+f_BNW_in;
+      }
+
+      q = q_dirBNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho/*+three*(-vx1+vx2-vx3)*/+c9o2*(-vx1+vx2-vx3)*(-vx1+vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         f_TSE_in=(c1o1-q)/(c1o1+q)*(f_BNW-f_TSE+(f_BNW+f_TSE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BNW+f_TSE))/(c1o1+q) - c1o216 * drho;
+         wallMomentumX -= f_BNW+f_TSE_in;
+         wallMomentumY += f_BNW+f_TSE_in;
+         wallMomentumZ -= f_BNW+f_TSE_in;
+      }
+
+      q = q_dirBSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho/*+three*( vx1-vx2-vx3)*/+c9o2*( vx1-vx2-vx3)*( vx1-vx2-vx3) * (c1o1 + drho)-cu_sq); 
+         f_TNW_in=(c1o1-q)/(c1o1+q)*(f_BSE-f_TNW+(f_BSE+f_TNW-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_BSE+f_TNW))/(c1o1+q) - c1o216 * drho;
+         wallMomentumX += f_BSE+f_TNW_in;
+         wallMomentumY -= f_BSE+f_TNW_in;
+         wallMomentumZ -= f_BSE+f_TNW_in;
+      }
+
+      q = q_dirTNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         feq=c1o216*(drho/*+three*(-vx1+vx2+vx3)*/+c9o2*(-vx1+vx2+vx3)*(-vx1+vx2+vx3) * (c1o1 + drho)-cu_sq); 
+         f_BSE_in=(c1o1-q)/(c1o1+q)*(f_TNW-f_BSE+(f_TNW+f_BSE-c2o1*feq*om_turb)/(c1o1-om_turb))*c1o2+(q*(f_TNW+f_BSE))/(c1o1+q) - c1o216 * drho;
+         wallMomentumX -= f_TNW+f_BSE_in;
+         wallMomentumY += f_TNW+f_BSE_in;
+         wallMomentumZ += f_TNW+f_BSE_in;
+      }
+
+      // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      // //Compute wall velocity
+      // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      real VeloX=0.0, VeloY=0.0, VeloZ=0.0; 
+
+      q = 0.5f;
+      real eps = 0.001f;
+
+      iMEM( k, k_N[k], 
+            normalX, normalY, normalZ,
+            vx, vy, vz,
+            vx_el,      vy_el,      vz_el,
+            vx_w_mean,  vy_w_mean,  vz_w_mean,
+            vx1,        vx2,        vx3,
+            c1o1+drho,
+            samplingOffset,
+            q,
+            1.0+q,
+            eps,
+            z0,
+            hasWallModelMonitor,
+            u_star_monitor,
+            wallMomentumX, wallMomentumY, wallMomentumZ,
+            VeloX, VeloY, VeloZ);
+
+      // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      // //Add wall velocity and write f's
+      // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      q = q_dirE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirW])[kw] = f_W_in - (c6o1*c2o27*( VeloX     ))/(c1o1+q);
+         wallMomentumX += -(c6o1*c2o27*( VeloX     ))/(c1o1+q);
+      }
+
+      q = q_dirW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirE])[ke] = f_E_in - (c6o1*c2o27*(-VeloX     ))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c2o27*(-VeloX     ))/(c1o1+q);
+      }
+
+      q = q_dirN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirS])[ks] = f_S_in - (c6o1*c2o27*( VeloY     ))/(c1o1+q);
+         wallMomentumY += - (c6o1*c2o27*( VeloY     ))/(c1o1+q);
+      }
+
+      q = q_dirS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirN])[kn] = f_N_in - (c6o1*c2o27*(-VeloY     ))/(c1o1+q);
+         wallMomentumY -=  -(c6o1*c2o27*(-VeloY     ))/(c1o1+q);
+      }
+
+      q = q_dirT[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirB])[kb] = f_B_in - (c6o1*c2o27*( VeloZ     ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c2o27*( VeloZ     ))/(c1o1+q);
+      }
+
+      q = q_dirB[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirT])[kt] = f_T_in - (c6o1*c2o27*(-VeloZ     ))/(c1o1+q);
+         wallMomentumZ -= -(c6o1*c2o27*(-VeloZ     ))/(c1o1+q);
+      }
+
+      q = q_dirNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirSW])[ksw] = f_SW_in - (c6o1*c1o54*(VeloX+VeloY))/(c1o1+q);
+         wallMomentumX +=  -(c6o1*c1o54*(VeloX+VeloY))/(c1o1+q);
+         wallMomentumY +=  -(c6o1*c1o54*(VeloX+VeloY))/(c1o1+q);
+      }
+
+      q = q_dirSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirNE])[kne] = f_NE_in - (c6o1*c1o54*(-VeloX-VeloY))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c1o54*(-VeloX-VeloY))/(c1o1+q);
+         wallMomentumY -= - (c6o1*c1o54*(-VeloX-VeloY))/(c1o1+q);
+      }
+
+      q = q_dirSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirNW])[knw] = f_NW_in - (c6o1*c1o54*( VeloX-VeloY))/(c1o1+q);
+         wallMomentumX += -(c6o1*c1o54*( VeloX-VeloY))/(c1o1+q);
+         wallMomentumY -= -(c6o1*c1o54*( VeloX-VeloY))/(c1o1+q);
+      }
+
+      q = q_dirNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirSE])[kse] = f_SE_in - (c6o1*c1o54*(-VeloX+VeloY))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c1o54*(-VeloX+VeloY))/(c1o1+q);
+         wallMomentumY += - (c6o1*c1o54*(-VeloX+VeloY))/(c1o1+q);
+      }
+
+      q = q_dirTE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBW])[kbw] = f_BW_in - (c6o1*c1o54*( VeloX+VeloZ))/(c1o1+q); 
+         wallMomentumX += - (c6o1*c1o54*( VeloX+VeloZ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c1o54*( VeloX+VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirBW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTE])[kte] = f_TE_in - (c6o1*c1o54*(-VeloX-VeloZ))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c1o54*(-VeloX-VeloZ))/(c1o1+q);
+         wallMomentumZ -= - (c6o1*c1o54*(-VeloX-VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirBE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTW])[ktw] = f_TW_in - (c6o1*c1o54*( VeloX-VeloZ))/(c1o1+q);
+         wallMomentumX += - (c6o1*c1o54*( VeloX-VeloZ))/(c1o1+q);
+         wallMomentumZ -= - (c6o1*c1o54*( VeloX-VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirTW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBE])[kbe] = f_BE_in - (c6o1*c1o54*(-VeloX+VeloZ))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c1o54*(-VeloX+VeloZ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c1o54*(-VeloX+VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirTN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBS])[kbs] = f_BS_in - (c6o1*c1o54*( VeloY+VeloZ))/(c1o1+q);
+         wallMomentumY += - (c6o1*c1o54*( VeloY+VeloZ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c1o54*( VeloY+VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirBS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTN])[ktn] = f_TN_in - (c6o1*c1o54*( -VeloY-VeloZ))/(c1o1+q);
+         wallMomentumY -= - (c6o1*c1o54*( -VeloY-VeloZ))/(c1o1+q);
+         wallMomentumZ -= - (c6o1*c1o54*( -VeloY-VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirBN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTS])[kts] = f_TS_in - (c6o1*c1o54*( VeloY-VeloZ))/(c1o1+q);
+         wallMomentumY += - (c6o1*c1o54*( VeloY-VeloZ))/(c1o1+q);
+         wallMomentumZ -= - (c6o1*c1o54*( VeloY-VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirTS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBN])[kbn] = f_BN_in - (c6o1*c1o54*( -VeloY+VeloZ))/(c1o1+q);
+         wallMomentumY -= - (c6o1*c1o54*( -VeloY+VeloZ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c1o54*( -VeloY+VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirTNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBSW])[kbsw] = f_BSW_in - (c6o1*c1o216*( VeloX+VeloY+VeloZ))/(c1o1+q);
+         wallMomentumX += - (c6o1*c1o216*( VeloX+VeloY+VeloZ))/(c1o1+q);
+         wallMomentumY += - (c6o1*c1o216*( VeloX+VeloY+VeloZ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c1o216*( VeloX+VeloY+VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirBSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTNE])[ktne] = f_TNE_in - (c6o1*c1o216*(-VeloX-VeloY-VeloZ))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c1o216*(-VeloX-VeloY-VeloZ))/(c1o1+q);
+         wallMomentumY -= - (c6o1*c1o216*(-VeloX-VeloY-VeloZ))/(c1o1+q);
+         wallMomentumZ -= - (c6o1*c1o216*(-VeloX-VeloY-VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirBNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTSW])[ktsw] = f_TSW_in - (c6o1*c1o216*( VeloX+VeloY-VeloZ))/(c1o1+q);
+         wallMomentumX += - (c6o1*c1o216*( VeloX+VeloY-VeloZ))/(c1o1+q);
+         wallMomentumY += - (c6o1*c1o216*( VeloX+VeloY-VeloZ))/(c1o1+q);
+         wallMomentumZ -= - (c6o1*c1o216*( VeloX+VeloY-VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirTSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBNE])[kbne] = f_BNE_in - (c6o1*c1o216*(-VeloX-VeloY+VeloZ))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c1o216*(-VeloX-VeloY+VeloZ))/(c1o1+q);
+         wallMomentumY -= - (c6o1*c1o216*(-VeloX-VeloY+VeloZ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c1o216*(-VeloX-VeloY+VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirTSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBNW])[kbnw] = f_BNW_in - (c6o1*c1o216*( VeloX-VeloY+VeloZ))/(c1o1+q);
+         wallMomentumX += - (c6o1*c1o216*( VeloX-VeloY+VeloZ))/(c1o1+q);
+         wallMomentumY -= - (c6o1*c1o216*( VeloX-VeloY+VeloZ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c1o216*( VeloX-VeloY+VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirBNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTSE])[ktse] = f_TSE_in - (c6o1*c1o216*(-VeloX+VeloY-VeloZ))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c1o216*(-VeloX+VeloY-VeloZ))/(c1o1+q);
+         wallMomentumY += - (c6o1*c1o216*(-VeloX+VeloY-VeloZ))/(c1o1+q);
+         wallMomentumZ -= - (c6o1*c1o216*(-VeloX+VeloY-VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirBSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTNW])[ktnw] = f_TNW_in - (c6o1*c1o216*( VeloX-VeloY-VeloZ))/(c1o1+q);
+         wallMomentumX += - (c6o1*c1o216*( VeloX-VeloY-VeloZ))/(c1o1+q);
+         wallMomentumY -= - (c6o1*c1o216*( VeloX-VeloY-VeloZ))/(c1o1+q);
+         wallMomentumZ -= - (c6o1*c1o216*( VeloX-VeloY-VeloZ))/(c1o1+q);
+      }
+
+      q = q_dirTNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBSE])[kbse] = f_BSE_in - (c6o1*c1o216*(-VeloX+VeloY+VeloZ))/(c1o1+q);
+         wallMomentumX -= - (c6o1*c1o216*(-VeloX+VeloY+VeloZ))/(c1o1+q);
+         wallMomentumY += - (c6o1*c1o216*(-VeloX+VeloY+VeloZ))/(c1o1+q);
+         wallMomentumZ += - (c6o1*c1o216*(-VeloX+VeloY+VeloZ))/(c1o1+q);
+      }
+
+      if(hasWallModelMonitor)
+      {
+         Fx_monitor[k] = wallMomentumX;
+         Fy_monitor[k] = wallMomentumY;
+         Fz_monitor[k] = wallMomentumZ;
+      }
+
+   }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+extern "C" __global__ void BBStressDevice27( real* DD, 
+											            int* k_Q, 
+                                             int* k_N, 
+                                             real* QQ,
+                                             unsigned int sizeQ,
+                                             real* vx,
+                                             real* vy,
+                                             real* vz,
+                                             real* normalX,
+                                             real* normalY,
+                                             real* normalZ,
+                                             real* vx_el,
+                                             real* vy_el,
+                                             real* vz_el,
+                                             real* vx_w_mean,
+                                             real* vy_w_mean,
+                                             real* vz_w_mean,
+                                             int* samplingOffset,
+                                             real* z0,
+                                             bool  hasWallModelMonitor,
+                                             real* u_star_monitor,
+                                             real* Fx_monitor,
+                                             real* Fy_monitor,
+                                             real* Fz_monitor,
+                                             unsigned int* neighborX,
+                                             unsigned int* neighborY,
+                                             unsigned int* neighborZ,
+                                             unsigned int size_Mat, 
+                                             bool evenOrOdd)
+{
+   Distributions27 D;
+   if (evenOrOdd==true)
+   {
+      D.f[dirE   ] = &DD[dirE   *size_Mat];
+      D.f[dirW   ] = &DD[dirW   *size_Mat];
+      D.f[dirN   ] = &DD[dirN   *size_Mat];
+      D.f[dirS   ] = &DD[dirS   *size_Mat];
+      D.f[dirT   ] = &DD[dirT   *size_Mat];
+      D.f[dirB   ] = &DD[dirB   *size_Mat];
+      D.f[dirNE  ] = &DD[dirNE  *size_Mat];
+      D.f[dirSW  ] = &DD[dirSW  *size_Mat];
+      D.f[dirSE  ] = &DD[dirSE  *size_Mat];
+      D.f[dirNW  ] = &DD[dirNW  *size_Mat];
+      D.f[dirTE  ] = &DD[dirTE  *size_Mat];
+      D.f[dirBW  ] = &DD[dirBW  *size_Mat];
+      D.f[dirBE  ] = &DD[dirBE  *size_Mat];
+      D.f[dirTW  ] = &DD[dirTW  *size_Mat];
+      D.f[dirTN  ] = &DD[dirTN  *size_Mat];
+      D.f[dirBS  ] = &DD[dirBS  *size_Mat];
+      D.f[dirBN  ] = &DD[dirBN  *size_Mat];
+      D.f[dirTS  ] = &DD[dirTS  *size_Mat];
+      D.f[dirZERO] = &DD[dirZERO*size_Mat];
+      D.f[dirTNE ] = &DD[dirTNE *size_Mat];
+      D.f[dirTSW ] = &DD[dirTSW *size_Mat];
+      D.f[dirTSE ] = &DD[dirTSE *size_Mat];
+      D.f[dirTNW ] = &DD[dirTNW *size_Mat];
+      D.f[dirBNE ] = &DD[dirBNE *size_Mat];
+      D.f[dirBSW ] = &DD[dirBSW *size_Mat];
+      D.f[dirBSE ] = &DD[dirBSE *size_Mat];
+      D.f[dirBNW ] = &DD[dirBNW *size_Mat];
+   } 
+   else
+   {
+      D.f[dirW   ] = &DD[dirE   *size_Mat];
+      D.f[dirE   ] = &DD[dirW   *size_Mat];
+      D.f[dirS   ] = &DD[dirN   *size_Mat];
+      D.f[dirN   ] = &DD[dirS   *size_Mat];
+      D.f[dirB   ] = &DD[dirT   *size_Mat];
+      D.f[dirT   ] = &DD[dirB   *size_Mat];
+      D.f[dirSW  ] = &DD[dirNE  *size_Mat];
+      D.f[dirNE  ] = &DD[dirSW  *size_Mat];
+      D.f[dirNW  ] = &DD[dirSE  *size_Mat];
+      D.f[dirSE  ] = &DD[dirNW  *size_Mat];
+      D.f[dirBW  ] = &DD[dirTE  *size_Mat];
+      D.f[dirTE  ] = &DD[dirBW  *size_Mat];
+      D.f[dirTW  ] = &DD[dirBE  *size_Mat];
+      D.f[dirBE  ] = &DD[dirTW  *size_Mat];
+      D.f[dirBS  ] = &DD[dirTN  *size_Mat];
+      D.f[dirTN  ] = &DD[dirBS  *size_Mat];
+      D.f[dirTS  ] = &DD[dirBN  *size_Mat];
+      D.f[dirBN  ] = &DD[dirTS  *size_Mat];
+      D.f[dirZERO] = &DD[dirZERO*size_Mat];
+      D.f[dirTNE ] = &DD[dirBSW *size_Mat];
+      D.f[dirTSW ] = &DD[dirBNE *size_Mat];
+      D.f[dirTSE ] = &DD[dirBNW *size_Mat];
+      D.f[dirTNW ] = &DD[dirBSE *size_Mat];
+      D.f[dirBNE ] = &DD[dirTSW *size_Mat];
+      D.f[dirBSW ] = &DD[dirTNE *size_Mat];
+      D.f[dirBSE ] = &DD[dirTNW *size_Mat];
+      D.f[dirBNW ] = &DD[dirTSE *size_Mat];
+   }
+   ////////////////////////////////////////////////////////////////////////////////
+   const unsigned  x = threadIdx.x;  // Globaler x-Index 
+   const unsigned  y = blockIdx.x;   // Globaler y-Index 
+   const unsigned  z = blockIdx.y;   // Globaler z-Index 
+
+   const unsigned nx = blockDim.x;
+   const unsigned ny = gridDim.x;
+
+   const unsigned k = nx*(ny*z + y) + x;
+   //////////////////////////////////////////////////////////////////////////
+
+   if(k<sizeQ)
+   {
+      ////////////////////////////////////////////////////////////////////////////////
+      real *q_dirE,   *q_dirW,   *q_dirN,   *q_dirS,   *q_dirT,   *q_dirB, 
+         *q_dirNE,  *q_dirSW,  *q_dirSE,  *q_dirNW,  *q_dirTE,  *q_dirBW,
+         *q_dirBE,  *q_dirTW,  *q_dirTN,  *q_dirBS,  *q_dirBN,  *q_dirTS,
+         *q_dirTNE, *q_dirTSW, *q_dirTSE, *q_dirTNW, *q_dirBNE, *q_dirBSW,
+         *q_dirBSE, *q_dirBNW; 
+      q_dirE   = &QQ[dirE   *sizeQ];
+      q_dirW   = &QQ[dirW   *sizeQ];
+      q_dirN   = &QQ[dirN   *sizeQ];
+      q_dirS   = &QQ[dirS   *sizeQ];
+      q_dirT   = &QQ[dirT   *sizeQ];
+      q_dirB   = &QQ[dirB   *sizeQ];
+      q_dirNE  = &QQ[dirNE  *sizeQ];
+      q_dirSW  = &QQ[dirSW  *sizeQ];
+      q_dirSE  = &QQ[dirSE  *sizeQ];
+      q_dirNW  = &QQ[dirNW  *sizeQ];
+      q_dirTE  = &QQ[dirTE  *sizeQ];
+      q_dirBW  = &QQ[dirBW  *sizeQ];
+      q_dirBE  = &QQ[dirBE  *sizeQ];
+      q_dirTW  = &QQ[dirTW  *sizeQ];
+      q_dirTN  = &QQ[dirTN  *sizeQ];
+      q_dirBS  = &QQ[dirBS  *sizeQ];
+      q_dirBN  = &QQ[dirBN  *sizeQ];
+      q_dirTS  = &QQ[dirTS  *sizeQ];
+      q_dirTNE = &QQ[dirTNE *sizeQ];
+      q_dirTSW = &QQ[dirTSW *sizeQ];
+      q_dirTSE = &QQ[dirTSE *sizeQ];
+      q_dirTNW = &QQ[dirTNW *sizeQ];
+      q_dirBNE = &QQ[dirBNE *sizeQ];
+      q_dirBSW = &QQ[dirBSW *sizeQ];
+      q_dirBSE = &QQ[dirBSE *sizeQ];
+      q_dirBNW = &QQ[dirBNW *sizeQ];
+      ////////////////////////////////////////////////////////////////////////////////
+      //index
+      unsigned int KQK  = k_Q[k];
+      unsigned int kzero= KQK;
+      unsigned int ke   = KQK;
+      unsigned int kw   = neighborX[KQK];
+      unsigned int kn   = KQK;
+      unsigned int ks   = neighborY[KQK];
+      unsigned int kt   = KQK;
+      unsigned int kb   = neighborZ[KQK];
+      unsigned int ksw  = neighborY[kw];
+      unsigned int kne  = KQK;
+      unsigned int kse  = ks;
+      unsigned int knw  = kw;
+      unsigned int kbw  = neighborZ[kw];
+      unsigned int kte  = KQK;
+      unsigned int kbe  = kb;
+      unsigned int ktw  = kw;
+      unsigned int kbs  = neighborZ[ks];
+      unsigned int ktn  = KQK;
+      unsigned int kbn  = kb;
+      unsigned int kts  = ks;
+      unsigned int ktse = ks;
+      unsigned int kbnw = kbw;
+      unsigned int ktnw = kw;
+      unsigned int kbse = kbs;
+      unsigned int ktsw = ksw;
+      unsigned int kbne = kb;
+      unsigned int ktne = KQK;
+      unsigned int kbsw = neighborZ[ksw];
+     
+      ////////////////////////////////////////////////////////////////////////////////
+      real f_E,  f_W,  f_N,  f_S,  f_T,  f_B,   f_NE,  f_SW,  f_SE,  f_NW,  f_TE,  f_BW,  f_BE,
+         f_TW, f_TN, f_BS, f_BN, f_TS, f_TNE, f_TSW, f_TSE, f_TNW, f_BNE, f_BSW, f_BSE, f_BNW;
+
+      f_W    = (D.f[dirE   ])[ke   ];
+      f_E    = (D.f[dirW   ])[kw   ];
+      f_S    = (D.f[dirN   ])[kn   ];
+      f_N    = (D.f[dirS   ])[ks   ];
+      f_B    = (D.f[dirT   ])[kt   ];
+      f_T    = (D.f[dirB   ])[kb   ];
+      f_SW   = (D.f[dirNE  ])[kne  ];
+      f_NE   = (D.f[dirSW  ])[ksw  ];
+      f_NW   = (D.f[dirSE  ])[kse  ];
+      f_SE   = (D.f[dirNW  ])[knw  ];
+      f_BW   = (D.f[dirTE  ])[kte  ];
+      f_TE   = (D.f[dirBW  ])[kbw  ];
+      f_TW   = (D.f[dirBE  ])[kbe  ];
+      f_BE   = (D.f[dirTW  ])[ktw  ];
+      f_BS   = (D.f[dirTN  ])[ktn  ];
+      f_TN   = (D.f[dirBS  ])[kbs  ];
+      f_TS   = (D.f[dirBN  ])[kbn  ];
+      f_BN   = (D.f[dirTS  ])[kts  ];
+      f_BSW  = (D.f[dirTNE ])[ktne ];
+      f_BNE  = (D.f[dirTSW ])[ktsw ];
+      f_BNW  = (D.f[dirTSE ])[ktse ];
+      f_BSE  = (D.f[dirTNW ])[ktnw ];
+      f_TSW  = (D.f[dirBNE ])[kbne ];
+      f_TNE  = (D.f[dirBSW ])[kbsw ];
+      f_TNW  = (D.f[dirBSE ])[kbse ];
+      f_TSE  = (D.f[dirBNW ])[kbnw ];
+
+      ////////////////////////////////////////////////////////////////////////////////
+      real vx1, vx2, vx3, drho;
+      drho   =  f_TSE + f_TNW + f_TNE + f_TSW + f_BSE + f_BNW + f_BNE + f_BSW +
+                f_BN + f_TS + f_TN + f_BS + f_BE + f_TW + f_TE + f_BW + f_SE + f_NW + f_NE + f_SW + 
+                f_T + f_B + f_N + f_S + f_E + f_W + ((D.f[dirZERO])[kzero]); 
+
+      vx1    =  (((f_TSE - f_BNW) - (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+                ((f_BE - f_TW)   + (f_TE - f_BW))   + ((f_SE - f_NW)   + (f_NE - f_SW)) +
+                (f_E - f_W)) / (c1o1 + drho); 
+         
+
+      vx2    =   ((-(f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) - (f_TSW - f_BNE)) +
+                 ((f_BN - f_TS)   + (f_TN - f_BS))    + (-(f_SE - f_NW)  + (f_NE - f_SW)) +
+                 (f_N - f_S)) / (c1o1 + drho); 
+
+      vx3    =   (((f_TSE - f_BNW) + (f_TNW - f_BSE)) + ((f_TNE - f_BSW) + (f_TSW - f_BNE)) +
+                 (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
+                 (f_T - f_B)) / (c1o1 + drho); 
+
+      //////////////////////////////////////////////////////////////////////////
+      if (evenOrOdd==false)
+      {
+         D.f[dirE   ] = &DD[dirE   *size_Mat];
+         D.f[dirW   ] = &DD[dirW   *size_Mat];
+         D.f[dirN   ] = &DD[dirN   *size_Mat];
+         D.f[dirS   ] = &DD[dirS   *size_Mat];
+         D.f[dirT   ] = &DD[dirT   *size_Mat];
+         D.f[dirB   ] = &DD[dirB   *size_Mat];
+         D.f[dirNE  ] = &DD[dirNE  *size_Mat];
+         D.f[dirSW  ] = &DD[dirSW  *size_Mat];
+         D.f[dirSE  ] = &DD[dirSE  *size_Mat];
+         D.f[dirNW  ] = &DD[dirNW  *size_Mat];
+         D.f[dirTE  ] = &DD[dirTE  *size_Mat];
+         D.f[dirBW  ] = &DD[dirBW  *size_Mat];
+         D.f[dirBE  ] = &DD[dirBE  *size_Mat];
+         D.f[dirTW  ] = &DD[dirTW  *size_Mat];
+         D.f[dirTN  ] = &DD[dirTN  *size_Mat];
+         D.f[dirBS  ] = &DD[dirBS  *size_Mat];
+         D.f[dirBN  ] = &DD[dirBN  *size_Mat];
+         D.f[dirTS  ] = &DD[dirTS  *size_Mat];
+         D.f[dirZERO] = &DD[dirZERO*size_Mat];
+         D.f[dirTNE ] = &DD[dirTNE *size_Mat];
+         D.f[dirTSW ] = &DD[dirTSW *size_Mat];
+         D.f[dirTSE ] = &DD[dirTSE *size_Mat];
+         D.f[dirTNW ] = &DD[dirTNW *size_Mat];
+         D.f[dirBNE ] = &DD[dirBNE *size_Mat];
+         D.f[dirBSW ] = &DD[dirBSW *size_Mat];
+         D.f[dirBSE ] = &DD[dirBSE *size_Mat];
+         D.f[dirBNW ] = &DD[dirBNW *size_Mat];
+      } 
+      else
+      {
+         D.f[dirW   ] = &DD[dirE   *size_Mat];
+         D.f[dirE   ] = &DD[dirW   *size_Mat];
+         D.f[dirS   ] = &DD[dirN   *size_Mat];
+         D.f[dirN   ] = &DD[dirS   *size_Mat];
+         D.f[dirB   ] = &DD[dirT   *size_Mat];
+         D.f[dirT   ] = &DD[dirB   *size_Mat];
+         D.f[dirSW  ] = &DD[dirNE  *size_Mat];
+         D.f[dirNE  ] = &DD[dirSW  *size_Mat];
+         D.f[dirNW  ] = &DD[dirSE  *size_Mat];
+         D.f[dirSE  ] = &DD[dirNW  *size_Mat];
+         D.f[dirBW  ] = &DD[dirTE  *size_Mat];
+         D.f[dirTE  ] = &DD[dirBW  *size_Mat];
+         D.f[dirTW  ] = &DD[dirBE  *size_Mat];
+         D.f[dirBE  ] = &DD[dirTW  *size_Mat];
+         D.f[dirBS  ] = &DD[dirTN  *size_Mat];
+         D.f[dirTN  ] = &DD[dirBS  *size_Mat];
+         D.f[dirTS  ] = &DD[dirBN  *size_Mat];
+         D.f[dirBN  ] = &DD[dirTS  *size_Mat];
+         D.f[dirZERO] = &DD[dirZERO*size_Mat];
+         D.f[dirTNE ] = &DD[dirBSW *size_Mat];
+         D.f[dirTSW ] = &DD[dirBNE *size_Mat];
+         D.f[dirTSE ] = &DD[dirBNW *size_Mat];
+         D.f[dirTNW ] = &DD[dirBSE *size_Mat];
+         D.f[dirBNE ] = &DD[dirTSW *size_Mat];
+         D.f[dirBSW ] = &DD[dirTNE *size_Mat];
+         D.f[dirBSE ] = &DD[dirTNW *size_Mat];
+         D.f[dirBNW ] = &DD[dirTSE *size_Mat];
+      }
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      real f_E_in,  f_W_in,  f_N_in,  f_S_in,  f_T_in,  f_B_in,   f_NE_in,  f_SW_in,  f_SE_in,  f_NW_in,  f_TE_in,  f_BW_in,  f_BE_in,
+         f_TW_in, f_TN_in, f_BS_in, f_BN_in, f_TS_in, f_TNE_in, f_TSW_in, f_TSE_in, f_TNW_in, f_BNE_in, f_BSW_in, f_BSE_in, f_BNW_in;
+      
+      // momentum exchanged with wall at rest
+      real wallMomentumX = 0.0, wallMomentumY = 0.0, wallMomentumZ = 0.0;
+      
+      real q;
+      q = q_dirE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_W_in=f_E;
+         wallMomentumX += f_E+f_W_in;
+      }
+
+      q = q_dirW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_E_in=f_W;
+          wallMomentumX -= f_W+f_E_in;
+      }
+
+      q = q_dirN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_S_in=f_N;
+         wallMomentumY += f_N+f_S_in;
+      }
+
+      q = q_dirS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_N_in=f_S;
+         wallMomentumY -= f_S+f_N_in;
+      }
+
+      q = q_dirT[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_B_in=f_T;
+         wallMomentumZ += f_T+f_B_in;
+      }
+
+      q = q_dirB[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_T_in=f_B;
+         wallMomentumZ -= f_B+f_T_in;
+      }
+
+      q = q_dirNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_SW_in=f_NE;
+         wallMomentumX += f_NE+f_SW_in;
+         wallMomentumY += f_NE+f_SW_in;
+      }
+
+      q = q_dirSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_NE_in=f_SW;
+         wallMomentumX -= f_SW+f_NE_in;
+         wallMomentumY -= f_SW+f_NE_in;
+      }
+
+      q = q_dirSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_NW_in=f_SE;
+         wallMomentumX += f_SE+f_NW_in;
+         wallMomentumY -= f_SE+f_NW_in;
+      }
+
+      q = q_dirNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_SE_in=f_NW;
+         wallMomentumX -= f_NW+f_SE_in;
+         wallMomentumY += f_NW+f_SE_in;
+      }
+
+      q = q_dirTE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_BW_in=f_TE;
+         wallMomentumX += f_TE+f_BW_in;
+         wallMomentumZ += f_TE+f_BW_in;
+      }
+
+      q = q_dirBW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_TE_in=f_BW;
+         wallMomentumX -= f_BW+f_TE_in;
+         wallMomentumZ -= f_BW+f_TE_in;
+      }
+
+      q = q_dirBE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_TW_in=f_BE;
+         wallMomentumX += f_BE+f_TW_in;
+         wallMomentumZ -= f_BE+f_TW_in;
+      }
+
+      q = q_dirTW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_BE_in=f_TW;
+         wallMomentumX -= f_TW+f_BE_in;
+         wallMomentumZ += f_TW+f_BE_in;
+      }
+
+      q = q_dirTN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_BS_in=f_TN;
+         wallMomentumY += f_TN+f_BS_in;
+         wallMomentumZ += f_TN+f_BS_in;
+      }
+
+      q = q_dirBS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_TN_in=f_BS;
+         wallMomentumY -= f_BS+f_TN_in;
+         wallMomentumZ -= f_BS+f_TN_in;
+      }
+
+      q = q_dirBN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_TS_in=f_BN;
+         wallMomentumY += f_BN+f_TS_in;
+         wallMomentumZ -= f_BN+f_TS_in;
+      }
+
+      q = q_dirTS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_BN_in=f_TS;
+         wallMomentumY -= f_TS+f_BN_in;
+         wallMomentumZ += f_TS+f_BN_in;
+      }
+
+      q = q_dirTNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_BSW_in=f_TNE;
+         wallMomentumX += f_TNE+f_BSW_in;
+         wallMomentumY += f_TNE+f_BSW_in;
+         wallMomentumZ += f_TNE+f_BSW_in;
+      }
+
+      q = q_dirBSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_TNE_in=f_BSW;
+         wallMomentumX -= f_BSW+f_TNE_in;
+         wallMomentumY -= f_BSW+f_TNE_in;
+         wallMomentumZ -= f_BSW+f_TNE_in;
+      }
+
+      q = q_dirBNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_TSW_in=f_BNE;
+         wallMomentumX += f_BNE+f_TSW_in;
+         wallMomentumY += f_BNE+f_TSW_in;
+         wallMomentumZ -= f_BNE+f_TSW_in;
+      }
+
+      q = q_dirTSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_BNE_in=f_TSW;
+         wallMomentumX -= f_TSW+f_BNE_in;
+         wallMomentumY -= f_TSW+f_BNE_in;
+         wallMomentumZ += f_TSW+f_BNE_in;
+      }
+
+      q = q_dirTSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_BNW_in=f_TSE;
+         wallMomentumX += f_TSE+f_BNW_in;
+         wallMomentumY -= f_TSE+f_BNW_in;
+         wallMomentumZ += f_TSE+f_BNW_in;
+      }
+
+      q = q_dirBNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_TSE_in=f_BNW;
+         wallMomentumX -= f_BNW+f_TSE_in;
+         wallMomentumY += f_BNW+f_TSE_in;
+         wallMomentumZ -= f_BNW+f_TSE_in;
+      }
+
+      q = q_dirBSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_TNW_in=f_BSE;
+         wallMomentumX += f_BSE+f_TNW_in;
+         wallMomentumY -= f_BSE+f_TNW_in;
+         wallMomentumZ -= f_BSE+f_TNW_in;
+      }
+
+      q = q_dirTNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         f_BSE_in=f_TNW;
+         wallMomentumX -= f_TNW+f_BSE_in;
+         wallMomentumY += f_TNW+f_BSE_in;
+         wallMomentumZ += f_TNW+f_BSE_in;
+      }
+
+      ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      // //Compute wall velocity
+      // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      real VeloX=0.0, VeloY=0.0, VeloZ=0.0; 
+
+      q = 0.5f;
+      real eps = 0.001f;
+
+      iMEM( k, k_N[k], 
+         normalX, normalY, normalZ,
+         vx, vy, vz,
+         vx_el,      vy_el,      vz_el,
+         vx_w_mean,  vy_w_mean,  vz_w_mean,
+         vx1,        vx2,        vx3,
+         c1o1+drho,
+         samplingOffset,
+         q,
+         1.0,
+         eps,
+         z0,
+         hasWallModelMonitor,
+         u_star_monitor,
+         wallMomentumX, wallMomentumY, wallMomentumZ,
+         VeloX, VeloY, VeloZ);
+      
+      // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+      // //Add wall velocity and write f's
+      // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+      q = q_dirE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirW])[kw] = f_W_in - (c6o1*c2o27*( VeloX     ));
+         wallMomentumX += -(c6o1*c2o27*( VeloX     ));
+      }
+
+      q = q_dirW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirE])[ke] = f_E_in - (c6o1*c2o27*(-VeloX     ));
+         wallMomentumX -= - (c6o1*c2o27*(-VeloX     ));
+      }
+
+      q = q_dirN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirS])[ks] = f_S_in - (c6o1*c2o27*( VeloY     ));
+         wallMomentumY += - (c6o1*c2o27*( VeloY     ));
+      }
+
+      q = q_dirS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirN])[kn] = f_N_in - (c6o1*c2o27*(-VeloY     ));
+         wallMomentumY -=  -(c6o1*c2o27*(-VeloY     ));
+      }
+
+      q = q_dirT[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirB])[kb] = f_B_in - (c6o1*c2o27*( VeloZ     ));
+         wallMomentumZ += - (c6o1*c2o27*( VeloZ     ));
+      }
+
+      q = q_dirB[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirT])[kt] = f_T_in - (c6o1*c2o27*(-VeloZ     ));
+         wallMomentumZ -= -(c6o1*c2o27*(-VeloZ     ));
+      }
+
+      q = q_dirNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirSW])[ksw] = f_SW_in - (c6o1*c1o54*(VeloX+VeloY));
+         wallMomentumX +=  -(c6o1*c1o54*(VeloX+VeloY));
+         wallMomentumY +=  -(c6o1*c1o54*(VeloX+VeloY));
+      }
+
+      q = q_dirSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirNE])[kne] = f_NE_in - (c6o1*c1o54*(-VeloX-VeloY));
+         wallMomentumX -= - (c6o1*c1o54*(-VeloX-VeloY));
+         wallMomentumY -= - (c6o1*c1o54*(-VeloX-VeloY));
+      }
+
+      q = q_dirSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirNW])[knw] = f_NW_in - (c6o1*c1o54*( VeloX-VeloY));
+         wallMomentumX += -(c6o1*c1o54*( VeloX-VeloY));
+         wallMomentumY -= -(c6o1*c1o54*( VeloX-VeloY));
+      }
+
+      q = q_dirNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirSE])[kse] = f_SE_in - (c6o1*c1o54*(-VeloX+VeloY));
+         wallMomentumX -= - (c6o1*c1o54*(-VeloX+VeloY));
+         wallMomentumY += - (c6o1*c1o54*(-VeloX+VeloY));
+      }
+
+      q = q_dirTE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBW])[kbw] = f_BW_in - (c6o1*c1o54*( VeloX+VeloZ));
+         wallMomentumX += - (c6o1*c1o54*( VeloX+VeloZ));
+         wallMomentumZ += - (c6o1*c1o54*( VeloX+VeloZ));
+      }
+
+      q = q_dirBW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTE])[kte] = f_TE_in - (c6o1*c1o54*(-VeloX-VeloZ));
+         wallMomentumX -= - (c6o1*c1o54*(-VeloX-VeloZ));
+         wallMomentumZ -= - (c6o1*c1o54*(-VeloX-VeloZ));
+      }
+
+      q = q_dirBE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTW])[ktw] = f_TW_in - (c6o1*c1o54*( VeloX-VeloZ));
+         wallMomentumX += - (c6o1*c1o54*( VeloX-VeloZ));
+         wallMomentumZ -= - (c6o1*c1o54*( VeloX-VeloZ));
+      }
+
+      q = q_dirTW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBE])[kbe] = f_BE_in - (c6o1*c1o54*(-VeloX+VeloZ));
+         wallMomentumX -= - (c6o1*c1o54*(-VeloX+VeloZ));
+         wallMomentumZ += - (c6o1*c1o54*(-VeloX+VeloZ));
+      }
+
+      q = q_dirTN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBS])[kbs] = f_BS_in - (c6o1*c1o54*( VeloY+VeloZ));
+         wallMomentumY += - (c6o1*c1o54*( VeloY+VeloZ));
+         wallMomentumZ += - (c6o1*c1o54*( VeloY+VeloZ));
+      }
+
+      q = q_dirBS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTN])[ktn] = f_TN_in - (c6o1*c1o54*( -VeloY-VeloZ));
+         wallMomentumY -= - (c6o1*c1o54*( -VeloY-VeloZ));
+         wallMomentumZ -= - (c6o1*c1o54*( -VeloY-VeloZ));
+      }
+
+      q = q_dirBN[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTS])[kts] = f_TS_in - (c6o1*c1o54*( VeloY-VeloZ));
+         wallMomentumY += - (c6o1*c1o54*( VeloY-VeloZ));
+         wallMomentumZ -= - (c6o1*c1o54*( VeloY-VeloZ));
+      }
+
+      q = q_dirTS[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBN])[kbn] = f_BN_in - (c6o1*c1o54*( -VeloY+VeloZ));
+         wallMomentumY -= - (c6o1*c1o54*( -VeloY+VeloZ));
+         wallMomentumZ += - (c6o1*c1o54*( -VeloY+VeloZ));
+      }
+
+      q = q_dirTNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBSW])[kbsw] = f_BSW_in - (c6o1*c1o216*( VeloX+VeloY+VeloZ));
+         wallMomentumX += - (c6o1*c1o216*( VeloX+VeloY+VeloZ));
+         wallMomentumY += - (c6o1*c1o216*( VeloX+VeloY+VeloZ));
+         wallMomentumZ += - (c6o1*c1o216*( VeloX+VeloY+VeloZ));
+      }
+
+      q = q_dirBSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTNE])[ktne] = f_TNE_in - (c6o1*c1o216*(-VeloX-VeloY-VeloZ));
+         wallMomentumX -= - (c6o1*c1o216*(-VeloX-VeloY-VeloZ));
+         wallMomentumY -= - (c6o1*c1o216*(-VeloX-VeloY-VeloZ));
+         wallMomentumZ -= - (c6o1*c1o216*(-VeloX-VeloY-VeloZ));
+      }
+
+      q = q_dirBNE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTSW])[ktsw] = f_TSW_in - (c6o1*c1o216*( VeloX+VeloY-VeloZ));
+         wallMomentumX += - (c6o1*c1o216*( VeloX+VeloY-VeloZ));
+         wallMomentumY += - (c6o1*c1o216*( VeloX+VeloY-VeloZ));
+         wallMomentumZ -= - (c6o1*c1o216*( VeloX+VeloY-VeloZ));
+      }
+
+      q = q_dirTSW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBNE])[kbne] = f_BNE_in - (c6o1*c1o216*(-VeloX-VeloY+VeloZ));
+         wallMomentumX -= - (c6o1*c1o216*(-VeloX-VeloY+VeloZ));
+         wallMomentumY -= - (c6o1*c1o216*(-VeloX-VeloY+VeloZ));
+         wallMomentumZ += - (c6o1*c1o216*(-VeloX-VeloY+VeloZ));
+      }
+
+      q = q_dirTSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBNW])[kbnw] = f_BNW_in - (c6o1*c1o216*( VeloX-VeloY+VeloZ));
+         wallMomentumX += - (c6o1*c1o216*( VeloX-VeloY+VeloZ));
+         wallMomentumY -= - (c6o1*c1o216*( VeloX-VeloY+VeloZ));
+         wallMomentumZ += - (c6o1*c1o216*( VeloX-VeloY+VeloZ));
+      }
+
+      q = q_dirBNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTSE])[ktse] = f_TSE_in - (c6o1*c1o216*(-VeloX+VeloY-VeloZ));
+         wallMomentumX -= - (c6o1*c1o216*(-VeloX+VeloY-VeloZ));
+         wallMomentumY += - (c6o1*c1o216*(-VeloX+VeloY-VeloZ));
+         wallMomentumZ -= - (c6o1*c1o216*(-VeloX+VeloY-VeloZ));
+      }
+
+      q = q_dirBSE[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirTNW])[ktnw] = f_TNW_in - (c6o1*c1o216*( VeloX-VeloY-VeloZ));
+         wallMomentumX += - (c6o1*c1o216*( VeloX-VeloY-VeloZ));
+         wallMomentumY -= - (c6o1*c1o216*( VeloX-VeloY-VeloZ));
+         wallMomentumZ -= - (c6o1*c1o216*( VeloX-VeloY-VeloZ));
+      }
+
+      q = q_dirTNW[k];
+      if (q>=c0o1 && q<=c1o1)
+      {
+         (D.f[dirBSE])[kbse] = f_BSE_in - (c6o1*c1o216*(-VeloX+VeloY+VeloZ));
+         wallMomentumX -= - (c6o1*c1o216*(-VeloX+VeloY+VeloZ));
+         wallMomentumY += - (c6o1*c1o216*(-VeloX+VeloY+VeloZ));
+         wallMomentumZ += - (c6o1*c1o216*(-VeloX+VeloY+VeloZ));
+      }
+
+      if(hasWallModelMonitor)
+      {
+         Fx_monitor[k] = wallMomentumX;
+         Fy_monitor[k] = wallMomentumY;
+         Fz_monitor[k] = wallMomentumZ;
+      }
+
+   }
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosity.cu b/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosity.cu
index b232ca7ef22d607420cc4cbbfb39cccb41618868..d510a4fe6f0f842d7882bef2eb4804461e986026 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosity.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/TurbulentViscosity.cu
@@ -69,7 +69,7 @@ extern "C" __global__ void calcAMD(real* vx,
                         (dvxdx*dvzdx + dvxdy*dvzdy + dvxdz*dvzdz) * (dvxdz+dvzdx) + 
                         (dvydx*dvzdx + dvydy*dvzdy + dvydz*dvzdz) * (dvydz+dvzdy);
 
-    turbulentViscosity[k] = -SGSConstant*enumerator/denominator;
+    turbulentViscosity[k] = max(c0o1,-SGSConstant*enumerator)/denominator;
 }
 
 extern "C" void calcTurbulentViscosityAMD(Parameter* para, int level)
@@ -88,5 +88,6 @@ extern "C" void calcTurbulentViscosityAMD(Parameter* para, int level)
         para->getParD(level)->size_Mat_SP,
         para->getSGSConstant()
     );
+    getLastCudaError("calcAMD execution failed");
 }
     
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu
index b457a782e5aa4922b298ed1500c31b230950cd6b..f2f02c6df050166259dc23f816b0c2829f85dc0c 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/TurbulentViscosityKernels/FluidFlow/Compressible/CumulantK17chim/TurbulentViscosityCumulantK17CompChim_Device.cu
@@ -26,9 +26,17 @@
 //  You should have received a copy of the GNU General Public License along
 //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
 //
-//! \file Cumulant27chim.cu
-//! \ingroup GPU
-//! \author Martin Schoenherr
+//! \file TurbulentViscosityCumulantK17CompChim_Device.cu
+//! \author Henry Korb, Henrik Asmuth
+//! \date 16/05/2022
+//! \brief CumulantK17CompChim kernel by Martin Schönherr that includes turbulent viscosity and other small mods.
+//!
+//! Additions to CumulantK17CompChim:
+//!     - can incorporate local body force 
+//!     - when applying a local body force, the total round-off error of forcing+bodyforce is saved and added in the next time step
+//!     - uses turbulent viscosity that is computed in separate kernel (as of now AMD)
+//!     - saves macroscopic values (needed for instance for probes, AMD, and actuator models)
+//!
 //=======================================================================================
 /* Device code */
 #include "LBM/LB.h" 
@@ -227,19 +235,46 @@ extern "C" __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
         real fz = forces[2];
 
         if( bodyForce ){
-            fx += bodyForceX[k];
+            fx += bodyForceX[k]; 
             fy += bodyForceY[k];
             fz += bodyForceZ[k];
 
-            //Reset body force
-            bodyForceX[k] = 0.0f;
-            bodyForceY[k] = 0.0f;
-            bodyForceZ[k] = 0.0f;
+            real vx = vvx;
+            real vy = vvy;
+            real vz = vvz;
+            real acc_x = fx * c1o2 / factor;
+            real acc_y = fy * c1o2 / factor;
+            real acc_z = fz * c1o2 / factor;
+
+            vvx += acc_x;
+            vvy += acc_y;
+            vvz += acc_z;
+            
+        //    // Reset body force. To be used when not using round-off correction.
+        // bodyForceX[k] = 0.0f;
+        // bodyForceY[k] = 0.0f;
+        // bodyForceZ[k] = 0.0f;
+
+            ////////////////////////////////////////////////////////////////////////////////////
+            //!> Round-off correction
+            //!
+            //!> Similar to Kahan summation algorithm (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
+            //!> Essentially computes the round-off error of the applied force and adds it in the next time step as a compensation.
+            //!> Seems to be necessary at very high Re boundary layers, where the forcing and velocity can
+            //!> differ by several orders of magnitude.
+            //!> \note 16/05/2022: Testing, still ongoing! 
+            //!
+            bodyForceX[k] = (acc_x-(vvx-vx))*factor*c2o1;
+            bodyForceY[k] = (acc_y-(vvy-vy))*factor*c2o1;
+            bodyForceZ[k] = (acc_z-(vvz-vz))*factor*c2o1;
+        }
+        else{
+            vvx += fx * c1o2 / factor;
+            vvy += fy * c1o2 / factor;
+            vvz += fz * c1o2 / factor;
         }
         
-        vvx += fx * c1o2 / factor;
-        vvy += fy * c1o2 / factor;
-        vvz += fz * c1o2 / factor;
+
         ////////////////////////////////////////////////////////////////////////////////////
         // calculate the square of velocities for this lattice node
         real vx2 = vvx * vvx;
@@ -315,10 +350,10 @@ extern "C" __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
         //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
         //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
         //!
-                ////////////////////////////////////////////////////////////////////////////////////
+        ////////////////////////////////////////////////////////////////////////////////////
         //! - Calculate modified omega with turbulent viscosity
         //!
-        real omega = omega_in / (c1o1 + c3o1*omega_in*max(c0o1, turbulentViscosity[k]));
+        real omega = omega_in / (c1o1 + c3o1*omega_in*turbulentViscosity[k]);
         ////////////////////////////////////////////////////////////
         // 2.
         real OxxPyyPzz = c1o1;
@@ -429,6 +464,24 @@ extern "C" __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
         real dxux = c1o2 * (-omega) * (mxxMyy + mxxMzz) + c1o2 * OxxPyyPzz * (mfaaa - mxxPyyPzz);
         real dyuy = dxux + omega * c3o2 * mxxMyy;
         real dzuz = dxux + omega * c3o2 * mxxMzz;
+
+        //Smagorinsky for debugging
+        // if(true)
+        // {   
+            // if(false && k==99976)
+            // {
+            //     printf("dudz+dwdu: \t %1.14f \n", Dxz );
+            //     printf("dvdz+dudy: \t %1.14f \n", Dxy );  
+            //     printf("dwdy+dvdz: \t %1.14f \n", Dyz );  
+            //     printf("nu_t * dudz+dwdu: \t %1.14f \n", turbulentViscosity[k]*Dxz );
+            //     printf("nu_t * dvdz+dudy: \t %1.14f \n", turbulentViscosity[k]*Dxy );  
+            //     printf("nu_t * dwdy+dvdz: \t %1.14f \n", turbulentViscosity[k]*Dyz );      
+            // } 
+        //     real Sbar = sqrt(c2o1*(dxux*dxux+dyuy*dyuy+dzuz*dzuz)+Dxy*Dxy+Dxz*Dxz+Dyz*Dyz);
+        //     real Cs = 0.08f;
+        //     turbulentViscosity[k] = Cs*Cs*Sbar;
+        // }
+
         ////////////////////////////////////////////////////////////
         //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
         //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
@@ -670,4 +723,875 @@ extern "C" __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
 
 
     }
-}
\ No newline at end of file
+}
+
+
+
+
+//WORK IN PROGRESS: Incorporating DistributionWrapper in kernel.....
+
+// //=======================================================================================
+// // ____          ____    __    ______     __________   __      __       __        __         
+// // \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+// //  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+// //   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+// //    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+// //     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+// //      \    \  |    |   ________________________________________________________________    
+// //       \    \ |    |  |  ______________________________________________________________|   
+// //        \    \|    |  |  |         __          __     __     __     ______      _______    
+// //         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+// //          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+// //           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+// //            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+// //
+// //  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+// //  redistribute it and/or modify it under the terms of the GNU General Public
+// //  License as published by the Free Software Foundation, either version 3 of 
+// //  the License, or (at your option) any later version.
+// //  
+// //  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+// //  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+// //  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+// //  for more details.
+// //  
+// //  You should have received a copy of the GNU General Public License along
+// //  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+// //
+// //! \file TurbulentViscosityCumulantK17CompChim_Device.cu
+// //! \author Henry Korb, Henrik Asmuth
+// //! \date 16/05/2022
+// //! \brief CumulantK17CompChim kernel by Martin Schönherr that includes turbulent viscosity and other small mods.
+// //!
+// //! Additions to CumulantK17CompChim:
+// //!     - can incorporate local body force 
+// //!     - when applying a local body force, the total round-off error of forcing+bodyforce is saved and added in the next time step
+// //!     - uses turbulent viscosity that is computed in separate kernel (as of now AMD)
+// //!     - saves macroscopic values (needed for instance for probes, AMD, and actuator models)
+// //!
+// //=======================================================================================
+// /* Device code */
+// #include "LBM/LB.h" 
+// #include "LBM/D3Q27.h"
+// #include <lbm/constants/NumericConstants.h>
+
+// using namespace vf::lbm::constant;
+// #include "Kernel/ChimeraTransformation.h"
+
+// #include "Kernel/Utilities/DistributionHelper.cuh"
+
+// #include "lbm/MacroscopicQuantities.h"
+
+// ////////////////////////////////////////////////////////////////////////////////
+// extern "C" __global__ void LB_Kernel_TurbulentViscosityCumulantK17CompChim(
+// 	real omega_in,
+// 	uint* typeOfGridNode,
+// 	uint* neighborX,
+// 	uint* neighborY,
+// 	uint* neighborZ,
+// 	real* distributions,
+//     real* rho,
+//     real* vx,
+//     real* vy,
+//     real* vz,
+//     real* turbulentViscosity,
+// 	unsigned long size_Mat,
+// 	int level,
+//     bool bodyForce,
+// 	real* forces,
+//     real* bodyForceX,
+//     real* bodyForceY,
+//     real* bodyForceZ,
+// 	real* quadricLimiters,
+// 	bool isEvenTimestep)
+// {
+//     //////////////////////////////////////////////////////////////////////////
+//     //! Cumulant K17 Kernel is based on \ref
+//     //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+//     //! ]</b></a> and \ref <a href="https://doi.org/10.1016/j.jcp.2017.07.004"><b>[ M. Geier et al. (2017),
+//     //! DOI:10.1016/j.jcp.2017.07.004 ]</b></a>
+//     //!
+//     //! The cumulant kernel is executed in the following steps
+//     //!
+//     ////////////////////////////////////////////////////////////////////////////////
+//     //! - Get node index coordinates from threadIdx, blockIdx, blockDim and gridDim.
+//     //!
+
+//     // const unsigned x = threadIdx.x;
+//     // const unsigned y = blockIdx.x;
+//     // const unsigned z = blockIdx.y;
+
+//     // const unsigned nx = blockDim.x;
+//     // const unsigned ny = gridDim.x;
+
+//     // const unsigned k = nx * (ny * z + y) + x;
+//     const unsigned k = vf::gpu::getNodeIndex();
+//     //////////////////////////////////////////////////////////////////////////
+//     // run for all indices in size_Mat and fluid nodes
+//     // if ((k < size_Mat) && (typeOfGridNode[k] == GEO_FLUID)) {
+//     if ((k < size_Mat) && vf::gpu::isValidFluidNode(typeOfGridNode[k])) {
+//         //////////////////////////////////////////////////////////////////////////
+//         //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on
+//         //! timestep is based on the esoteric twist algorithm \ref <a
+//         //! href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.3390/computation5020019 ]</b></a>
+//         //!
+
+//         vf::gpu::DistributionWrapper distr_wrapper( distributions, size_Mat, 
+//                                                     isEvenTimestep, k, 
+//                                                     neighborX, neighborY, neighborZ);
+
+//         Distributions27 dist;
+//         if (isEvenTimestep) {
+//             dist.f[dirE]    = &distributions[dirE * size_Mat];
+//             dist.f[dirW]    = &distributions[dirW * size_Mat];
+//             dist.f[dirN]    = &distributions[dirN * size_Mat];
+//             dist.f[dirS]    = &distributions[dirS * size_Mat];
+//             dist.f[dirT]    = &distributions[dirT * size_Mat];
+//             dist.f[dirB]    = &distributions[dirB * size_Mat];
+//             dist.f[dirNE]   = &distributions[dirNE * size_Mat];
+//             dist.f[dirSW]   = &distributions[dirSW * size_Mat];
+//             dist.f[dirSE]   = &distributions[dirSE * size_Mat];
+//             dist.f[dirNW]   = &distributions[dirNW * size_Mat];
+//             dist.f[dirTE]   = &distributions[dirTE * size_Mat];
+//             dist.f[dirBW]   = &distributions[dirBW * size_Mat];
+//             dist.f[dirBE]   = &distributions[dirBE * size_Mat];
+//             dist.f[dirTW]   = &distributions[dirTW * size_Mat];
+//             dist.f[dirTN]   = &distributions[dirTN * size_Mat];
+//             dist.f[dirBS]   = &distributions[dirBS * size_Mat];
+//             dist.f[dirBN]   = &distributions[dirBN * size_Mat];
+//             dist.f[dirTS]   = &distributions[dirTS * size_Mat];
+//             dist.f[dirZERO] = &distributions[dirZERO * size_Mat];
+//             dist.f[dirTNE]  = &distributions[dirTNE * size_Mat];
+//             dist.f[dirTSW]  = &distributions[dirTSW * size_Mat];
+//             dist.f[dirTSE]  = &distributions[dirTSE * size_Mat];
+//             dist.f[dirTNW]  = &distributions[dirTNW * size_Mat];
+//             dist.f[dirBNE]  = &distributions[dirBNE * size_Mat];
+//             dist.f[dirBSW]  = &distributions[dirBSW * size_Mat];
+//             dist.f[dirBSE]  = &distributions[dirBSE * size_Mat];
+//             dist.f[dirBNW]  = &distributions[dirBNW * size_Mat];
+//         } else {
+//             dist.f[dirW]    = &distributions[dirE * size_Mat];
+//             dist.f[dirE]    = &distributions[dirW * size_Mat];
+//             dist.f[dirS]    = &distributions[dirN * size_Mat];
+//             dist.f[dirN]    = &distributions[dirS * size_Mat];
+//             dist.f[dirB]    = &distributions[dirT * size_Mat];
+//             dist.f[dirT]    = &distributions[dirB * size_Mat];
+//             dist.f[dirSW]   = &distributions[dirNE * size_Mat];
+//             dist.f[dirNE]   = &distributions[dirSW * size_Mat];
+//             dist.f[dirNW]   = &distributions[dirSE * size_Mat];
+//             dist.f[dirSE]   = &distributions[dirNW * size_Mat];
+//             dist.f[dirBW]   = &distributions[dirTE * size_Mat];
+//             dist.f[dirTE]   = &distributions[dirBW * size_Mat];
+//             dist.f[dirTW]   = &distributions[dirBE * size_Mat];
+//             dist.f[dirBE]   = &distributions[dirTW * size_Mat];
+//             dist.f[dirBS]   = &distributions[dirTN * size_Mat];
+//             dist.f[dirTN]   = &distributions[dirBS * size_Mat];
+//             dist.f[dirTS]   = &distributions[dirBN * size_Mat];
+//             dist.f[dirBN]   = &distributions[dirTS * size_Mat];
+//             dist.f[dirZERO] = &distributions[dirZERO * size_Mat];
+//             dist.f[dirBSW]  = &distributions[dirTNE * size_Mat];
+//             dist.f[dirBNE]  = &distributions[dirTSW * size_Mat];
+//             dist.f[dirBNW]  = &distributions[dirTSE * size_Mat];
+//             dist.f[dirBSE]  = &distributions[dirTNW * size_Mat];
+//             dist.f[dirTSW]  = &distributions[dirBNE * size_Mat];
+//             dist.f[dirTNE]  = &distributions[dirBSW * size_Mat];
+//             dist.f[dirTNW]  = &distributions[dirBSE * size_Mat];
+//             dist.f[dirTSE]  = &distributions[dirBNW * size_Mat];
+//         }
+//         ////////////////////////////////////////////////////////////////////////////////
+//         //! - Set neighbor indices (necessary for indirect addressing)
+//         uint kw   = neighborX[k];
+//         uint ks   = neighborY[k];
+//         uint kb   = neighborZ[k];
+//         uint ksw  = neighborY[kw];
+//         uint kbw  = neighborZ[kw];
+//         uint kbs  = neighborZ[ks];
+//         uint kbsw = neighborZ[ksw];
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Set local distributions
+//         //!
+
+//         // real mfcbb = distr_wrapper.distribution.f[dirE];
+//         // real mfabb = distr_wrapper.distribution.f[dirW];
+//         // real mfbcb = distr_wrapper.distribution.f[dirN];
+//         // real mfbab = distr_wrapper.distribution.f[dirS];
+//         // real mfbbc = distr_wrapper.distribution.f[dirT];
+//         // real mfbba = distr_wrapper.distribution.f[dirB];
+//         // real mfccb = distr_wrapper.distribution.f[dirNE];
+//         // real mfaab = distr_wrapper.distribution.f[dirSW];
+//         // real mfcab = distr_wrapper.distribution.f[dirSE];
+//         // real mfacb = distr_wrapper.distribution.f[dirNW];
+//         // real mfcbc = distr_wrapper.distribution.f[dirTE];
+//         // real mfaba = distr_wrapper.distribution.f[dirBW];
+//         // real mfcba = distr_wrapper.distribution.f[dirBE];
+//         // real mfabc = distr_wrapper.distribution.f[dirTW];
+//         // real mfbcc = distr_wrapper.distribution.f[dirTN];
+//         // real mfbaa = distr_wrapper.distribution.f[dirBS];
+//         // real mfbca = distr_wrapper.distribution.f[dirBN];
+//         // real mfbac = distr_wrapper.distribution.f[dirTS];
+//         // real mfbbb = distr_wrapper.distribution.f[dirZERO];
+//         // real mfccc = distr_wrapper.distribution.f[dirTNE];
+//         // real mfaac = distr_wrapper.distribution.f[dirTSW];
+//         // real mfcac = distr_wrapper.distribution.f[dirTSE];
+//         // real mfacc = distr_wrapper.distribution.f[dirTNW];
+//         // real mfcca = distr_wrapper.distribution.f[dirBNE];
+//         // real mfaaa = distr_wrapper.distribution.f[dirBSW];
+//         // real mfcaa = distr_wrapper.distribution.f[dirBSE];
+//         // real mfaca = distr_wrapper.distribution.f[dirBNW];
+
+        
+//         real mfcbb = (dist.f[dirE])[k];
+//         real mfabb = (dist.f[dirW])[kw];
+//         real mfbcb = (dist.f[dirN])[k];
+//         real mfbab = (dist.f[dirS])[ks];
+//         real mfbbc = (dist.f[dirT])[k];
+//         real mfbba = (dist.f[dirB])[kb];
+//         real mfccb = (dist.f[dirNE])[k];
+//         real mfaab = (dist.f[dirSW])[ksw];
+//         real mfcab = (dist.f[dirSE])[ks];
+//         real mfacb = (dist.f[dirNW])[kw];
+//         real mfcbc = (dist.f[dirTE])[k];
+//         real mfaba = (dist.f[dirBW])[kbw];
+//         real mfcba = (dist.f[dirBE])[kb];
+//         real mfabc = (dist.f[dirTW])[kw];
+//         real mfbcc = (dist.f[dirTN])[k];
+//         real mfbaa = (dist.f[dirBS])[kbs];
+//         real mfbca = (dist.f[dirBN])[kb];
+//         real mfbac = (dist.f[dirTS])[ks];
+//         real mfbbb = (dist.f[dirZERO])[k];
+//         real mfccc = (dist.f[dirTNE])[k];
+//         real mfaac = (dist.f[dirTSW])[ksw];
+//         real mfcac = (dist.f[dirTSE])[ks];
+//         real mfacc = (dist.f[dirTNW])[kw];
+//         real mfcca = (dist.f[dirBNE])[kb];
+//         real mfaaa = (dist.f[dirBSW])[kbsw];
+//         real mfcaa = (dist.f[dirBSE])[kbs];
+//         real mfaca = (dist.f[dirBNW])[kbw];
+        
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Calculate density and velocity using pyramid summation for low round-off errors as in Eq. (J1)-(J3) \ref
+//         //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+//         //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+//         //!
+//         // real drho = ((((mfccc + mfaaa) + (mfaca + mfcac)) + ((mfacc + mfcaa) + (mfaac + mfcca))) +
+//         //              (((mfbac + mfbca) + (mfbaa + mfbcc)) + ((mfabc + mfcba) + (mfaba + mfcbc)) +
+//         //               ((mfacb + mfcab) + (mfaab + mfccb))) +
+//         //              ((mfabb + mfcbb) + (mfbab + mfbcb) + (mfbba + mfbbc))) +
+//         //             mfbbb;
+//         real drho = vf::lbm::getDensity(distr_wrapper.distribution.f);
+
+//         real rrho   = c1o1 + drho;
+//         real OOrho = c1o1 / rrho;
+
+//         // real vvx = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfcaa - mfacc) + (mfcca - mfaac))) +
+//         //             (((mfcba - mfabc) + (mfcbc - mfaba)) + ((mfcab - mfacb) + (mfccb - mfaab))) + (mfcbb - mfabb)) *
+//         //            OOrho;
+//         real vvx = vf::lbm::getCompressibleVelocityX1(distr_wrapper.distribution.f, drho);
+//         // real vvy = ((((mfccc - mfaaa) + (mfaca - mfcac)) + ((mfacc - mfcaa) + (mfcca - mfaac))) +
+//         //             (((mfbca - mfbac) + (mfbcc - mfbaa)) + ((mfacb - mfcab) + (mfccb - mfaab))) + (mfbcb - mfbab)) *
+//         //            OOrho;
+//         real vvy = vf::lbm::getCompressibleVelocityX2(distr_wrapper.distribution.f, drho);
+//         // real vvz = ((((mfccc - mfaaa) + (mfcac - mfaca)) + ((mfacc - mfcaa) + (mfaac - mfcca))) +
+//         //             (((mfbac - mfbca) + (mfbcc - mfbaa)) + ((mfabc - mfcba) + (mfcbc - mfaba))) + (mfbbc - mfbba)) *
+//         //            OOrho;
+//         real vvz = vf::lbm::getCompressibleVelocityX3(distr_wrapper.distribution.f, drho);
+//         // if(k==100000){printf("%f \t %f \t%f \t%f \n\n", drho, vvx, vvz, vvy);}
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Add half of the acceleration (body force) to the velocity as in Eq. (42) \ref
+//         //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+//         //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+//         //!
+//         real factor = c1o1;
+//         for (size_t i = 1; i <= level; i++) {
+//             factor *= c2o1;
+//         }
+        
+//         real fx = forces[0];
+//         real fy = forces[1];
+//         real fz = forces[2];
+
+//         if( bodyForce ){
+//             fx += bodyForceX[k]; 
+//             fy += bodyForceY[k];
+//             fz += bodyForceZ[k];
+
+//             real vx = vvx;
+//             real vy = vvy;
+//             real vz = vvz;
+//             real acc_x = fx * c1o2 / factor;
+//             real acc_y = fy * c1o2 / factor;
+//             real acc_z = fz * c1o2 / factor;
+
+//             vvx += acc_x;
+//             vvy += acc_y;
+//             vvz += acc_z;
+            
+//         //    // Reset body force. To be used when not using round-off correction.
+//         // bodyForceX[k] = 0.0f;
+//         // bodyForceY[k] = 0.0f;
+//         // bodyForceZ[k] = 0.0f;
+
+//             ////////////////////////////////////////////////////////////////////////////////////
+//             //!> Round-off correction
+//             //!
+//             //!> Similar to Kahan summation algorithm (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
+//             //!> Essentially computes the round-off error of the applied force and adds it in the next time step as a compensation.
+//             //!> Seems to be necessary at very high Re boundary layers, where the forcing and velocity can
+//             //!> differ by several orders of magnitude.
+//             //!> \note 16/05/2022: Testing, still ongoing! 
+//             //!
+//             bodyForceX[k] = (acc_x-(double)(vvx-vx))*factor*c2o1;
+//             bodyForceY[k] = (acc_y-(double)(vvy-vy))*factor*c2o1;
+//             bodyForceZ[k] = (acc_z-(double)(vvz-vz))*factor*c2o1;
+
+//         }
+//         else{
+//             vvx += fx * c1o2 / factor;
+//             vvy += fy * c1o2 / factor;
+//             vvz += fz * c1o2 / factor;
+//         }
+        
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // calculate the square of velocities for this lattice node
+//         real vx2 = vvx * vvx;
+//         real vy2 = vvy * vvy;
+//         real vz2 = vvz * vvz;
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Set relaxation limiters for third order cumulants to default value \f$ \lambda=0.001 \f$ according to
+//         //! section 6 in \ref <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//         //!
+//         real wadjust;
+//         real qudricLimitP = quadricLimiters[0];
+//         real qudricLimitM = quadricLimiters[1];
+//         real qudricLimitD = quadricLimiters[2];
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Chimera transform from well conditioned distributions to central moments as defined in Appendix J in \ref
+//         //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+//         //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (6)-(14) in \ref <a
+//         //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+//         //! ]</b></a>
+//         //!
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // Z - Dir
+//         forwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
+//         forwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9o1, c1o9);
+//         forwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
+//         forwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9o1, c1o9);
+//         forwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
+//         forwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9o1, c1o9);
+//         forwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
+//         forwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9o1, c1o9);
+//         forwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // Y - Dir
+//         forwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6o1, c1o6);
+//         forwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
+//         forwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
+//         forwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
+//         forwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
+//         forwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
+//         forwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6o1, c1o6);
+//         forwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
+//         forwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // X - Dir
+//         forwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
+//         forwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
+//         forwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
+//         forwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
+//         forwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
+//         forwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
+//         forwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
+//         forwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
+//         forwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c3o1, c1o9);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Setting relaxation rates for non-hydrodynamic cumulants (default values). Variable names and equations
+//         //! according to <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//         //!  => [NAME IN PAPER]=[NAME IN CODE]=[DEFAULT VALUE].
+//         //!  - Trace of second order cumulants \f$ C_{200}+C_{020}+C_{002} \f$ used to adjust bulk
+//         //!  viscosity:\f$\omega_2=OxxPyyPzz=1.0 \f$.
+//         //!  - Third order cumulants \f$ C_{120}+C_{102}, C_{210}+C_{012}, C_{201}+C_{021} \f$: \f$ \omega_3=OxyyPxzz
+//         //!  \f$ set according to Eq. (111) with simplifications assuming \f$ \omega_2=1.0\f$.
+//         //!  - Third order cumulants \f$ C_{120}-C_{102}, C_{210}-C_{012}, C_{201}-C_{021} \f$: \f$ \omega_4 = OxyyMxzz
+//         //!  \f$ set according to Eq. (112) with simplifications assuming \f$ \omega_2 = 1.0\f$.
+//         //!  - Third order cumulants \f$ C_{111} \f$: \f$ \omega_5 = Oxyz \f$ set according to Eq. (113) with
+//         //!  simplifications assuming \f$ \omega_2 = 1.0\f$  (modify for different bulk viscosity).
+//         //!  - Fourth order cumulants \f$ C_{220}, C_{202}, C_{022}, C_{211}, C_{121}, C_{112} \f$: for simplification
+//         //!  all set to the same default value \f$ \omega_6=\omega_7=\omega_8=O4=1.0 \f$.
+//         //!  - Fifth order cumulants \f$ C_{221}, C_{212}, C_{122}\f$: \f$\omega_9=O5=1.0\f$.
+//         //!  - Sixth order cumulant \f$ C_{222}\f$: \f$\omega_{10}=O6=1.0\f$.
+//         //!
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Calculate modified omega with turbulent viscosity
+//         //!
+//         real omega = omega_in / (c1o1 + c3o1*omega_in*turbulentViscosity[k]);
+//         ////////////////////////////////////////////////////////////
+//         // 2.
+//         real OxxPyyPzz = c1o1;
+//         ////////////////////////////////////////////////////////////
+//         // 3.
+//         real OxyyPxzz = c8o1 * (-c2o1 + omega) * (c1o1 + c2o1 * omega) / (-c8o1 - c14o1 * omega + c7o1 * omega * omega);
+//         real OxyyMxzz =
+//             c8o1 * (-c2o1 + omega) * (-c7o1 + c4o1 * omega) / (c56o1 - c50o1 * omega + c9o1 * omega * omega);
+//         real Oxyz = c24o1 * (-c2o1 + omega) * (-c2o1 - c7o1 * omega + c3o1 * omega * omega) /
+//                     (c48o1 + c152o1 * omega - c130o1 * omega * omega + c29o1 * omega * omega * omega);
+//         ////////////////////////////////////////////////////////////
+//         // 4.
+//         real O4 = c1o1;
+//         ////////////////////////////////////////////////////////////
+//         // 5.
+//         real O5 = c1o1;
+//         ////////////////////////////////////////////////////////////
+//         // 6.
+//         real O6 = c1o1;
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - A and B: parameters for fourth order convergence of the diffusion term according to Eq. (114) and (115)
+//         //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a> with simplifications assuming \f$ \omega_2 = 1.0 \f$ (modify for
+//         //! different bulk viscosity).
+//         //!
+//         real A = (c4o1 + c2o1 * omega - c3o1 * omega * omega) / (c2o1 - c7o1 * omega + c5o1 * omega * omega);
+//         real B = (c4o1 + c28o1 * omega - c14o1 * omega * omega) / (c6o1 - c21o1 * omega + c15o1 * omega * omega);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Compute cumulants from central moments according to Eq. (20)-(23) in
+//         //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//         //!
+//         ////////////////////////////////////////////////////////////
+//         // 4.
+//         real CUMcbb = mfcbb - ((mfcaa + c1o3) * mfabb + c2o1 * mfbba * mfbab) * OOrho;
+//         real CUMbcb = mfbcb - ((mfaca + c1o3) * mfbab + c2o1 * mfbba * mfabb) * OOrho;
+//         real CUMbbc = mfbbc - ((mfaac + c1o3) * mfbba + c2o1 * mfbab * mfabb) * OOrho;
+
+//         real CUMcca =
+//             mfcca - (((mfcaa * mfaca + c2o1 * mfbba * mfbba) + c1o3 * (mfcaa + mfaca)) * OOrho - c1o9 * (drho * OOrho));
+//         real CUMcac =
+//             mfcac - (((mfcaa * mfaac + c2o1 * mfbab * mfbab) + c1o3 * (mfcaa + mfaac)) * OOrho - c1o9 * (drho * OOrho));
+//         real CUMacc =
+//             mfacc - (((mfaac * mfaca + c2o1 * mfabb * mfabb) + c1o3 * (mfaac + mfaca)) * OOrho - c1o9 * (drho * OOrho));
+//         ////////////////////////////////////////////////////////////
+//         // 5.
+//         real CUMbcc =
+//             mfbcc - ((mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb + c2o1 * (mfbab * mfacb + mfbba * mfabc)) +
+//                      c1o3 * (mfbca + mfbac)) *
+//                         OOrho;
+//         real CUMcbc =
+//             mfcbc - ((mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb + c2o1 * (mfabb * mfcab + mfbba * mfbac)) +
+//                      c1o3 * (mfcba + mfabc)) *
+//                         OOrho;
+//         real CUMccb =
+//             mfccb - ((mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb + c2o1 * (mfbab * mfbca + mfabb * mfcba)) +
+//                      c1o3 * (mfacb + mfcab)) *
+//                         OOrho;
+//         ////////////////////////////////////////////////////////////
+//         // 6.
+//         real CUMccc = mfccc + ((-c4o1 * mfbbb * mfbbb - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca) -
+//                                 c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc) -
+//                                 c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) *
+//                                    OOrho +
+//                                (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac) +
+//                                 c2o1 * (mfcaa * mfaca * mfaac) + c16o1 * mfbba * mfbab * mfabb) *
+//                                    OOrho * OOrho -
+//                                c1o3 * (mfacc + mfcac + mfcca) * OOrho - c1o9 * (mfcaa + mfaca + mfaac) * OOrho +
+//                                (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba) +
+//                                 (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) *
+//                                    OOrho * OOrho * c2o3 +
+//                                c1o27 * ((drho * drho - drho) * OOrho * OOrho));
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Compute linear combinations of second and third order cumulants
+//         //!
+//         ////////////////////////////////////////////////////////////
+//         // 2.
+//         real mxxPyyPzz = mfcaa + mfaca + mfaac;
+//         real mxxMyy    = mfcaa - mfaca;
+//         real mxxMzz    = mfcaa - mfaac;
+//         ////////////////////////////////////////////////////////////
+//         // 3.
+//         real mxxyPyzz = mfcba + mfabc;
+//         real mxxyMyzz = mfcba - mfabc;
+
+//         real mxxzPyyz = mfcab + mfacb;
+//         real mxxzMyyz = mfcab - mfacb;
+
+//         real mxyyPxzz = mfbca + mfbac;
+//         real mxyyMxzz = mfbca - mfbac;
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // incl. correction
+//         ////////////////////////////////////////////////////////////
+//         //! - Compute velocity  gradients from second order cumulants according to Eq. (27)-(32)
+//         //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a> Further explanations of the correction in viscosity in Appendix H of
+//         //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+//         //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> Note that the division by rho is omitted here as we need rho times
+//         //! the gradients later.
+//         //!
+//         real Dxy  = -c3o1 * omega * mfbba;
+//         real Dxz  = -c3o1 * omega * mfbab;
+//         real Dyz  = -c3o1 * omega * mfabb;
+//         real dxux = c1o2 * (-omega) * (mxxMyy + mxxMzz) + c1o2 * OxxPyyPzz * (mfaaa - mxxPyyPzz);
+//         real dyuy = dxux + omega * c3o2 * mxxMyy;
+//         real dzuz = dxux + omega * c3o2 * mxxMzz;
+
+//         //Smagorinsky for debugging
+//         // if(true)
+//         // {   
+//             // if(false && k==99976)
+//             // {
+//             //     printf("dudz+dwdu: \t %1.14f \n", Dxz );
+//             //     printf("dvdz+dudy: \t %1.14f \n", Dxy );  
+//             //     printf("dwdy+dvdz: \t %1.14f \n", Dyz );  
+//             //     printf("nu_t * dudz+dwdu: \t %1.14f \n", turbulentViscosity[k]*Dxz );
+//             //     printf("nu_t * dvdz+dudy: \t %1.14f \n", turbulentViscosity[k]*Dxy );  
+//             //     printf("nu_t * dwdy+dvdz: \t %1.14f \n", turbulentViscosity[k]*Dyz );      
+//             // } 
+//         //     real Sbar = sqrt(c2o1*(dxux*dxux+dyuy*dyuy+dzuz*dzuz)+Dxy*Dxy+Dxz*Dxz+Dyz*Dyz);
+//         //     real Cs = 0.08f;
+//         //     turbulentViscosity[k] = Cs*Cs*Sbar;
+//         // }
+
+//         ////////////////////////////////////////////////////////////
+//         //! - Relaxation of second order cumulants with correction terms according to Eq. (33)-(35) in
+//         //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//         //!
+//         mxxPyyPzz +=
+//             OxxPyyPzz * (mfaaa - mxxPyyPzz) - c3o1 * (c1o1 - c1o2 * OxxPyyPzz) * (vx2 * dxux + vy2 * dyuy + vz2 * dzuz);
+//         mxxMyy += omega * (-mxxMyy) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vy2 * dyuy);
+//         mxxMzz += omega * (-mxxMzz) - c3o1 * (c1o1 + c1o2 * (-omega)) * (vx2 * dxux - vz2 * dzuz);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         ////no correction
+//         // mxxPyyPzz += OxxPyyPzz*(mfaaa - mxxPyyPzz);
+//         // mxxMyy += -(-omega) * (-mxxMyy);
+//         // mxxMzz += -(-omega) * (-mxxMzz);
+//         //////////////////////////////////////////////////////////////////////////
+//         mfabb += omega * (-mfabb);
+//         mfbab += omega * (-mfbab);
+//         mfbba += omega * (-mfbba);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // relax
+//         //////////////////////////////////////////////////////////////////////////
+//         // incl. limiter
+//         //! - Relaxation of third order cumulants including limiter according to Eq. (116)-(123)
+//         //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//         //!
+//         wadjust = Oxyz + (c1o1 - Oxyz) * abs(mfbbb) / (abs(mfbbb) + qudricLimitD);
+//         mfbbb += wadjust * (-mfbbb);
+//         wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxyPyzz) / (abs(mxxyPyzz) + qudricLimitP);
+//         mxxyPyzz += wadjust * (-mxxyPyzz);
+//         wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxxyMyzz) / (abs(mxxyMyzz) + qudricLimitM);
+//         mxxyMyzz += wadjust * (-mxxyMyzz);
+//         wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxxzPyyz) / (abs(mxxzPyyz) + qudricLimitP);
+//         mxxzPyyz += wadjust * (-mxxzPyyz);
+//         wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxxzMyyz) / (abs(mxxzMyyz) + qudricLimitM);
+//         mxxzMyyz += wadjust * (-mxxzMyyz);
+//         wadjust = OxyyPxzz + (c1o1 - OxyyPxzz) * abs(mxyyPxzz) / (abs(mxyyPxzz) + qudricLimitP);
+//         mxyyPxzz += wadjust * (-mxyyPxzz);
+//         wadjust = OxyyMxzz + (c1o1 - OxyyMxzz) * abs(mxyyMxzz) / (abs(mxyyMxzz) + qudricLimitM);
+//         mxyyMxzz += wadjust * (-mxyyMxzz);
+//         //////////////////////////////////////////////////////////////////////////
+//         // no limiter
+//         // mfbbb += OxyyMxzz * (-mfbbb);
+//         // mxxyPyzz += OxyyPxzz * (-mxxyPyzz);
+//         // mxxyMyzz += OxyyMxzz * (-mxxyMyzz);
+//         // mxxzPyyz += OxyyPxzz * (-mxxzPyyz);
+//         // mxxzMyyz += OxyyMxzz * (-mxxzMyyz);
+//         // mxyyPxzz += OxyyPxzz * (-mxyyPxzz);
+//         // mxyyMxzz += OxyyMxzz * (-mxyyMxzz);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Compute inverse linear combinations of second and third order cumulants
+//         //!
+//         mfcaa = c1o3 * (mxxMyy + mxxMzz + mxxPyyPzz);
+//         mfaca = c1o3 * (-c2o1 * mxxMyy + mxxMzz + mxxPyyPzz);
+//         mfaac = c1o3 * (mxxMyy - c2o1 * mxxMzz + mxxPyyPzz);
+
+//         mfcba = (mxxyMyzz + mxxyPyzz) * c1o2;
+//         mfabc = (-mxxyMyzz + mxxyPyzz) * c1o2;
+//         mfcab = (mxxzMyyz + mxxzPyyz) * c1o2;
+//         mfacb = (-mxxzMyyz + mxxzPyyz) * c1o2;
+//         mfbca = (mxyyMxzz + mxyyPxzz) * c1o2;
+//         mfbac = (-mxyyMxzz + mxyyPxzz) * c1o2;
+//         //////////////////////////////////////////////////////////////////////////
+
+//         //////////////////////////////////////////////////////////////////////////
+//         // 4.
+//         // no limiter
+//         //! - Relax fourth order cumulants to modified equilibrium for fourth order convergence of diffusion according
+//         //! to Eq. (43)-(48) <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//         //!
+//         CUMacc = -O4 * (c1o1 / omega - c1o2) * (dyuy + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMacc);
+//         CUMcac = -O4 * (c1o1 / omega - c1o2) * (dxux + dzuz) * c2o3 * A + (c1o1 - O4) * (CUMcac);
+//         CUMcca = -O4 * (c1o1 / omega - c1o2) * (dyuy + dxux) * c2o3 * A + (c1o1 - O4) * (CUMcca);
+//         CUMbbc = -O4 * (c1o1 / omega - c1o2) * Dxy * c1o3 * B + (c1o1 - O4) * (CUMbbc);
+//         CUMbcb = -O4 * (c1o1 / omega - c1o2) * Dxz * c1o3 * B + (c1o1 - O4) * (CUMbcb);
+//         CUMcbb = -O4 * (c1o1 / omega - c1o2) * Dyz * c1o3 * B + (c1o1 - O4) * (CUMcbb);
+
+//         //////////////////////////////////////////////////////////////////////////
+//         // 5.
+//         CUMbcc += O5 * (-CUMbcc);
+//         CUMcbc += O5 * (-CUMcbc);
+//         CUMccb += O5 * (-CUMccb);
+
+//         //////////////////////////////////////////////////////////////////////////
+//         // 6.
+//         CUMccc += O6 * (-CUMccc);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Compute central moments from post collision cumulants according to Eq. (53)-(56) in
+//         //! <a href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.1016/j.jcp.2017.05.040 ]</b></a>
+//         //!
+
+//         //////////////////////////////////////////////////////////////////////////
+//         // 4.
+//         mfcbb = CUMcbb + c1o3 * ((c3o1 * mfcaa + c1o1) * mfabb + c6o1 * mfbba * mfbab) * OOrho;
+//         mfbcb = CUMbcb + c1o3 * ((c3o1 * mfaca + c1o1) * mfbab + c6o1 * mfbba * mfabb) * OOrho;
+//         mfbbc = CUMbbc + c1o3 * ((c3o1 * mfaac + c1o1) * mfbba + c6o1 * mfbab * mfabb) * OOrho;
+
+//         mfcca =
+//             CUMcca +
+//             (((mfcaa * mfaca + c2o1 * mfbba * mfbba) * c9o1 + c3o1 * (mfcaa + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
+//         mfcac =
+//             CUMcac +
+//             (((mfcaa * mfaac + c2o1 * mfbab * mfbab) * c9o1 + c3o1 * (mfcaa + mfaac)) * OOrho - (drho * OOrho)) * c1o9;
+//         mfacc =
+//             CUMacc +
+//             (((mfaac * mfaca + c2o1 * mfabb * mfabb) * c9o1 + c3o1 * (mfaac + mfaca)) * OOrho - (drho * OOrho)) * c1o9;
+
+//         //////////////////////////////////////////////////////////////////////////
+//         // 5.
+//         mfbcc = CUMbcc + c1o3 *
+//                              (c3o1 * (mfaac * mfbca + mfaca * mfbac + c4o1 * mfabb * mfbbb +
+//                                       c2o1 * (mfbab * mfacb + mfbba * mfabc)) +
+//                               (mfbca + mfbac)) *
+//                              OOrho;
+//         mfcbc = CUMcbc + c1o3 *
+//                              (c3o1 * (mfaac * mfcba + mfcaa * mfabc + c4o1 * mfbab * mfbbb +
+//                                       c2o1 * (mfabb * mfcab + mfbba * mfbac)) +
+//                               (mfcba + mfabc)) *
+//                              OOrho;
+//         mfccb = CUMccb + c1o3 *
+//                              (c3o1 * (mfcaa * mfacb + mfaca * mfcab + c4o1 * mfbba * mfbbb +
+//                                       c2o1 * (mfbab * mfbca + mfabb * mfcba)) +
+//                               (mfacb + mfcab)) *
+//                              OOrho;
+
+//         //////////////////////////////////////////////////////////////////////////
+//         // 6.
+//         mfccc = CUMccc - ((-c4o1 * mfbbb * mfbbb - (mfcaa * mfacc + mfaca * mfcac + mfaac * mfcca) -
+//                            c4o1 * (mfabb * mfcbb + mfbab * mfbcb + mfbba * mfbbc) -
+//                            c2o1 * (mfbca * mfbac + mfcba * mfabc + mfcab * mfacb)) *
+//                               OOrho +
+//                           (c4o1 * (mfbab * mfbab * mfaca + mfabb * mfabb * mfcaa + mfbba * mfbba * mfaac) +
+//                            c2o1 * (mfcaa * mfaca * mfaac) + c16o1 * mfbba * mfbab * mfabb) *
+//                               OOrho * OOrho -
+//                           c1o3 * (mfacc + mfcac + mfcca) * OOrho - c1o9 * (mfcaa + mfaca + mfaac) * OOrho +
+//                           (c2o1 * (mfbab * mfbab + mfabb * mfabb + mfbba * mfbba) +
+//                            (mfaac * mfaca + mfaac * mfcaa + mfaca * mfcaa) + c1o3 * (mfaac + mfaca + mfcaa)) *
+//                               OOrho * OOrho * c2o3 +
+//                           c1o27 * ((drho * drho - drho) * OOrho * OOrho));
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! -  Add acceleration (body force) to first order cumulants according to Eq. (85)-(87) in
+//         //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+//         //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a>
+//         //!
+//         mfbaa = -mfbaa;
+//         mfaba = -mfaba;
+//         mfaab = -mfaab;
+
+
+//         //Write to array here to distribute read/write
+//         rho[k] = drho;
+//         vx[k] = vvx;
+//         vy[k] = vvy;
+//         vz[k] = vvz;
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Chimera transform from central moments to well conditioned distributions as defined in Appendix J in
+//         //! <a href="https://doi.org/10.1016/j.camwa.2015.05.001"><b>[ M. Geier et al. (2015),
+//         //! DOI:10.1016/j.camwa.2015.05.001 ]</b></a> see also Eq. (88)-(96) in <a
+//         //! href="https://doi.org/10.1016/j.jcp.2017.05.040"><b>[ M. Geier et al. (2017), DOI:10.1016/j.jcp.2017.05.040
+//         //! ]</b></a>
+//         //!
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // X - Dir
+//         backwardInverseChimeraWithK(mfaaa, mfbaa, mfcaa, vvx, vx2, c1o1, c1o1);
+//         backwardChimera(mfaba, mfbba, mfcba, vvx, vx2);
+//         backwardInverseChimeraWithK(mfaca, mfbca, mfcca, vvx, vx2, c3o1, c1o3);
+//         backwardChimera(mfaab, mfbab, mfcab, vvx, vx2);
+//         backwardChimera(mfabb, mfbbb, mfcbb, vvx, vx2);
+//         backwardChimera(mfacb, mfbcb, mfccb, vvx, vx2);
+//         backwardInverseChimeraWithK(mfaac, mfbac, mfcac, vvx, vx2, c3o1, c1o3);
+//         backwardChimera(mfabc, mfbbc, mfcbc, vvx, vx2);
+//         backwardInverseChimeraWithK(mfacc, mfbcc, mfccc, vvx, vx2, c9o1, c1o9);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // Y - Dir
+//         backwardInverseChimeraWithK(mfaaa, mfaba, mfaca, vvy, vy2, c6o1, c1o6);
+//         backwardChimera(mfaab, mfabb, mfacb, vvy, vy2);
+//         backwardInverseChimeraWithK(mfaac, mfabc, mfacc, vvy, vy2, c18o1, c1o18);
+//         backwardInverseChimeraWithK(mfbaa, mfbba, mfbca, vvy, vy2, c3o2, c2o3);
+//         backwardChimera(mfbab, mfbbb, mfbcb, vvy, vy2);
+//         backwardInverseChimeraWithK(mfbac, mfbbc, mfbcc, vvy, vy2, c9o2, c2o9);
+//         backwardInverseChimeraWithK(mfcaa, mfcba, mfcca, vvy, vy2, c6o1, c1o6);
+//         backwardChimera(mfcab, mfcbb, mfccb, vvy, vy2);
+//         backwardInverseChimeraWithK(mfcac, mfcbc, mfccc, vvy, vy2, c18o1, c1o18);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         // Z - Dir
+//         backwardInverseChimeraWithK(mfaaa, mfaab, mfaac, vvz, vz2, c36o1, c1o36);
+//         backwardInverseChimeraWithK(mfaba, mfabb, mfabc, vvz, vz2, c9o1, c1o9);
+//         backwardInverseChimeraWithK(mfaca, mfacb, mfacc, vvz, vz2, c36o1, c1o36);
+//         backwardInverseChimeraWithK(mfbaa, mfbab, mfbac, vvz, vz2, c9o1, c1o9);
+//         backwardInverseChimeraWithK(mfbba, mfbbb, mfbbc, vvz, vz2, c9o4, c4o9);
+//         backwardInverseChimeraWithK(mfbca, mfbcb, mfbcc, vvz, vz2, c9o1, c1o9);
+//         backwardInverseChimeraWithK(mfcaa, mfcab, mfcac, vvz, vz2, c36o1, c1o36);
+//         backwardInverseChimeraWithK(mfcba, mfcbb, mfcbc, vvz, vz2, c9o1, c1o9);
+//         backwardInverseChimeraWithK(mfcca, mfccb, mfccc, vvz, vz2, c36o1, c1o36);
+
+//         ////////////////////////////////////////////////////////////////////////////////////
+//         //! - Write distributions: style of reading and writing the distributions from/to
+//         //! stored arrays dependent on timestep is based on the esoteric twist algorithm
+//         //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
+//         //! DOI:10.3390/computation5020019 ]</b></a>
+//         //!
+
+
+//         distr_wrapper.distribution.f[dirE]      = mfabb;
+//         distr_wrapper.distribution.f[dirW]      = mfcbb;
+//         distr_wrapper.distribution.f[dirN]      = mfbab;
+//         distr_wrapper.distribution.f[dirS]      = mfbcb;
+//         distr_wrapper.distribution.f[dirT]      = mfbba;
+//         distr_wrapper.distribution.f[dirB]      = mfbbc;
+//         distr_wrapper.distribution.f[dirNE]     = mfaab;
+//         distr_wrapper.distribution.f[dirSW]     = mfccb;
+//         distr_wrapper.distribution.f[dirSE]     = mfacb;
+//         distr_wrapper.distribution.f[dirNW]     = mfcab;
+//         distr_wrapper.distribution.f[dirTE]     = mfaba;
+//         distr_wrapper.distribution.f[dirBW]     = mfcbc;
+//         distr_wrapper.distribution.f[dirBE]     = mfabc;
+//         distr_wrapper.distribution.f[dirTW]     = mfcba;
+//         distr_wrapper.distribution.f[dirTN]     = mfbaa;
+//         distr_wrapper.distribution.f[dirBS]     = mfbcc;
+//         distr_wrapper.distribution.f[dirBN]     = mfbac;
+//         distr_wrapper.distribution.f[dirTS]     = mfbca;
+//         distr_wrapper.distribution.f[dirZERO]   = mfbbb;
+//         distr_wrapper.distribution.f[dirTNE]    = mfaaa;
+//         distr_wrapper.distribution.f[dirTSW]    = mfaca;
+//         distr_wrapper.distribution.f[dirTSE]    = mfaac;
+//         distr_wrapper.distribution.f[dirTNW]    = mfacc;
+//         distr_wrapper.distribution.f[dirBNE]    = mfcaa;
+//         distr_wrapper.distribution.f[dirBSW]    = mfcca;
+//         distr_wrapper.distribution.f[dirBSE]    = mfcac;
+//         distr_wrapper.distribution.f[dirBNW]    = mfccc;
+
+//         distr_wrapper.write();
+//         if(k==100000)
+//         {
+//             printf("mfcbb \t %f \t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f \n\n", 
+//                                                 (dist.f[dirE])[k]                  ,        
+//                                                 (dist.f[dirN])[k]       ,
+//                                                 (dist.f[dirS])[ks]      ,
+//                                                 (dist.f[dirT])[k]       ,
+//                                                 (dist.f[dirB])[kb]      ,
+//                                                 (dist.f[dirNE])[k]      ,
+//                                                 (dist.f[dirSW])[ksw]    ,
+//                                                 (dist.f[dirSE])[ks]     ,
+//                                                 (dist.f[dirNW])[kw]   ,
+//                                                 (dist.f[dirW])[kw]    ,
+//                                                 (dist.f[dirTE])[k]    ,
+//                                                 (dist.f[dirBW])[kbw]  ,
+//                                                 (dist.f[dirBE])[kb]   ,
+//                                                 (dist.f[dirTW])[kw]   ,
+//                                                 (dist.f[dirTN])[k]    ,
+//                                                 (dist.f[dirBS])[kbs]  ,
+//                                                 (dist.f[dirBN])[kb]   ,
+//                                                 (dist.f[dirTS])[ks]   ,
+//                                                 (dist.f[dirZERO])[k]  ,
+//                                                 (dist.f[dirTNE])[k]   ,
+//                                                 (dist.f[dirTSE])[ks]  ,
+//                                                 (dist.f[dirBNE])[kb]  ,
+//                                                 (dist.f[dirBSE])[kbs] ,
+//                                                 (dist.f[dirTNW])[kw]  ,
+//                                                 (dist.f[dirTSW])[ksw] ,
+//                                                 (dist.f[dirBNW])[kbw] ,
+//                                                 (dist.f[dirBSW])[kbsw]);
+//         }
+
+//         (dist.f[dirE])[k]      = mfabb;
+//         (dist.f[dirW])[kw]     = mfcbb;
+//         (dist.f[dirN])[k]      = mfbab;
+//         (dist.f[dirS])[ks]     = mfbcb;
+//         (dist.f[dirT])[k]      = mfbba;
+//         (dist.f[dirB])[kb]     = mfbbc;
+//         (dist.f[dirNE])[k]     = mfaab;
+//         (dist.f[dirSW])[ksw]   = mfccb;
+//         (dist.f[dirSE])[ks]    = mfacb;
+//         (dist.f[dirNW])[kw]    = mfcab;
+//         (dist.f[dirTE])[k]     = mfaba;
+//         (dist.f[dirBW])[kbw]   = mfcbc;
+//         (dist.f[dirBE])[kb]    = mfabc;
+//         (dist.f[dirTW])[kw]    = mfcba;
+//         (dist.f[dirTN])[k]     = mfbaa;
+//         (dist.f[dirBS])[kbs]   = mfbcc;
+//         (dist.f[dirBN])[kb]    = mfbac;
+//         (dist.f[dirTS])[ks]    = mfbca;
+//         (dist.f[dirZERO])[k]   = mfbbb;
+//         (dist.f[dirTNE])[k]    = mfaaa;
+//         (dist.f[dirTSE])[ks]   = mfaca;
+//         (dist.f[dirBNE])[kb]   = mfaac;
+//         (dist.f[dirBSE])[kbs]  = mfacc;
+//         (dist.f[dirTNW])[kw]   = mfcaa;
+//         (dist.f[dirTSW])[ksw]  = mfcca;
+//         (dist.f[dirBNW])[kbw]  = mfcac;
+//         (dist.f[dirBSW])[kbsw] = mfccc;
+        
+//         if(k==100000)
+//         {
+//             printf("mfcbb \t %f \t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f\t %f \n\n\n", 
+//                                                 (dist.f[dirE])[k]                  ,        
+//                                                 (dist.f[dirN])[k]       ,
+//                                                 (dist.f[dirS])[ks]      ,
+//                                                 (dist.f[dirT])[k]       ,
+//                                                 (dist.f[dirB])[kb]      ,
+//                                                 (dist.f[dirNE])[k]      ,
+//                                                 (dist.f[dirSW])[ksw]    ,
+//                                                 (dist.f[dirSE])[ks]     ,
+//                                                 (dist.f[dirNW])[kw]   ,
+//                                                 (dist.f[dirW])[kw]   ,
+//                                                 (dist.f[dirTE])[k]    ,
+//                                                 (dist.f[dirBW])[kbw]  ,
+//                                                 (dist.f[dirBE])[kb]   ,
+//                                                 (dist.f[dirTW])[kw]   ,
+//                                                 (dist.f[dirTN])[k]    ,
+//                                                 (dist.f[dirBS])[kbs]  ,
+//                                                 (dist.f[dirBN])[kb]   ,
+//                                                 (dist.f[dirTS])[ks]   ,
+//                                                 (dist.f[dirZERO])[k]  ,
+//                                                 (dist.f[dirTNE])[k]   ,
+//                                                 (dist.f[dirTSE])[ks]  ,
+//                                                 (dist.f[dirBNE])[kb]  ,
+//                                                 (dist.f[dirBSE])[kbs] ,
+//                                                 (dist.f[dirTNW])[kw]  ,
+//                                                 (dist.f[dirTSW])[ksw] ,
+//                                                 (dist.f[dirBNW])[kbw] ,
+//                                                 (dist.f[dirBSW])[kbsw]);
+//         }
+//     }
+// }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/LBM/LB.h b/src/gpu/VirtualFluids_GPU/LBM/LB.h
index 7424c473e7482ce1ad997a6241bbc1749e4a668f..17404fb959849663130c7de2a86764f198b9c32e 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/LB.h
+++ b/src/gpu/VirtualFluids_GPU/LBM/LB.h
@@ -126,6 +126,7 @@ struct InitCondition
    bool isMeasurePoints {false};
    bool isInitNeq {false};
    bool isGeoNormal, isInflowNormal, isOutflowNormal;
+   bool hasWallModelMonitor {false};
    bool simulatePorousMedia {false};
    bool streetVelocityFile {false};
 };
@@ -186,7 +187,10 @@ typedef struct QforBC{
    real* q19[19];
    int kQ=0;
    int kArray;
-   real *Vx, *Vy, *Vz, *deltaVz, *RhoBC;
+   real *Vx,      *Vy,      *Vz;
+   real *Vx1,     *Vy1,     *Vz1;
+   real *deltaVz, *RhoBC;
+   real *normalX, *normalY, *normalZ;
 }QforBoundaryConditions;
 
 //BCTemp
@@ -213,6 +217,17 @@ typedef struct TempPressforBC{
    int kTemp=0;
 }TempPressforBoundaryConditions;
 
+// Settings for wall model used in StressBC; all members carry a default so an unconfigured instance holds no indeterminate values
+typedef struct WMparas{
+   real* z0 = nullptr;              // presumably the roughness length per boundary node - TODO confirm
+   int* samplingOffset = nullptr;   // presumably the node offset at which the wall model samples the flow - TODO confirm
+   bool hasMonitor = false;         // default agrees with InitCondition::hasWallModelMonitor {false}
+   real* u_star = nullptr;          // presumably the friction velocity, stored when monitoring - TODO confirm
+   real* Fx = nullptr;              // Fx, Fy, Fz: presumably wall force components, stored when monitoring - TODO confirm
+   real* Fy = nullptr;
+   real* Fz = nullptr;
+}WallModelParameters;
+
 //measurePoints
 typedef struct MeasP{
 	std::string name;
diff --git a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
index 3c845e24963061f4c61f50cd0084771bce9afae9..a16566ecb734f8c8c5d1e7d5f01df884e6464f6a 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
+++ b/src/gpu/VirtualFluids_GPU/LBM/Simulation.cpp
@@ -37,6 +37,8 @@
 #include "Calculation/ForceCalculations.h"
 #include "Calculation/PorousMedia.h"
 //////////////////////////////////////////////////////////////////////////
+#include "Output/Timer.h"
+//////////////////////////////////////////////////////////////////////////
 #include "Restart/RestartObject.h"
 //////////////////////////////////////////////////////////////////////////
 #include "DataStructureInitializer/GridProvider.h"
@@ -126,6 +128,11 @@ void Simulation::init(SPtr<Parameter> para, SPtr<GridProvider> gridProvider, std
    output << "vis_ratio:  "   << para->getViscosityRatio() << "\n";
    output << "u0_ratio:   "   << para->getVelocityRatio()  << "\n";
    output << "delta_rho:  "   << para->getDensityRatio()   << "\n";
+   output << "QuadricLimiters:  "   << para->getQuadricLimitersHost()[0] << "\t"
+   									<< para->getQuadricLimitersHost()[1] << "\t"
+									<< para->getQuadricLimitersHost()[2] << "\n";
+   if(para->getUseAMD())
+		output << "AMD SGS model:  "   << para->getSGSConstant()   << "\n";
    //////////////////////////////////////////////////////////////////////////
 
    /////////////////////////////////////////////////////////////////////////
@@ -379,14 +386,9 @@ void Simulation::bulk()
 
 void Simulation::run()
 {
-   double ftimeE, ftimeS, fnups, durchsatz;
-   float timerE, timerS;
-   timerE   = 0.0f;
-   timerS   = 0.0f;
-   ftimeE   = 0.0f;
-   ftimeS   = 0.0f;
    unsigned int t, t_prev;
    unsigned int t_MP = 0;
+
    //////////////////////////////////////////////////////////////////////////
    para->setStepEnsight(0);
 
@@ -404,28 +406,21 @@ void Simulation::run()
    }
    //////////////////////////////////////////////////////////////////////////
 
-   //Timer SDK
-   StopWatchInterface *sdkTimer = NULL;
-   sdkCreateTimer(&sdkTimer);
-   sdkStartTimer(&sdkTimer);
-   //Timer Event
-   cudaEvent_t start_t, stop_t;
-   checkCudaErrors( cudaEventCreate(&start_t));
-   checkCudaErrors( cudaEventCreate(&stop_t));
-   checkCudaErrors( cudaEventRecord(start_t));
-
    t_prev = para->getTimeCalcMedStart();
 
-   output << "Processing time (ms) \t Nups in Mio \t Durchsatz in GB/sec\n";
+	output << "getMaxLevel = " << para->getMaxLevel() << "\n";
+
+	std::unique_ptr<Timer> averageTimer(new Timer("Average performance")); // owned by run(): released, together with its CUDA events, on return
+	averageTimer->startTimer();
 
-   output << "getMaxLevel = " << para->getMaxLevel() << "\n";
 	////////////////////////////////////////////////////////////////////////////////
 	// Time loop
 	////////////////////////////////////////////////////////////////////////////////
 	for(t=para->getTStart();t<=para->getTEnd();t++)
 	{
+		
         updateGrid27(para.get(), communicator, cudaManager.get(), pm, 0, t, kernels);
-
+		
 	    ////////////////////////////////////////////////////////////////////////////////
 	    //Particles
 	    ////////////////////////////////////////////////////////////////////////////////
@@ -497,14 +492,8 @@ void Simulation::run()
         ////////////////////////////////////////////////////////////////////////////////
         if(para->getDoCheckPoint() && para->getTimeDoCheckPoint()>0 && t%para->getTimeDoCheckPoint()==0 && t>0 && !para->overWritingRestart(t))
         {
+			averageTimer->stopTimer();
             //////////////////////////////////////////////////////////////////////////
-            //Timer SDK
-            sdkStopTimer(&sdkTimer);
-            sdkResetTimer(&sdkTimer);
-            //////////////////////////////////////////////////////////////////////////
-            //Timer Event
-            checkCudaErrors( cudaEventRecord(stop_t));
-            checkCudaErrors( cudaEventSynchronize(stop_t));
             
             if( para->getDoCheckPoint() )
             {
@@ -523,11 +512,7 @@ void Simulation::run()
                 output << "\n fertig\n";
             }
             //////////////////////////////////////////////////////////////////////////
-            //Timer SDK
-            sdkStartTimer(&sdkTimer);
-            //////////////////////////////////////////////////////////////////////////
-            //Timer Event
-            checkCudaErrors( cudaEventRecord(start_t));
+			averageTimer->startTimer();
         }
         //////////////////////////////////////////////////////////////////////////////
 
@@ -641,36 +626,10 @@ void Simulation::run()
 		  //else                                    para->getParD(0)->evenOrOdd=true;
 		  //////////////////////////////////////////////////////////////////////////////////
 
-		  
-		 //////////////////////////////////////////////////////////////////////////
-		 //Timer SDK
-		 checkCudaErrors(cudaDeviceSynchronize());
-		 sdkStopTimer(&sdkTimer);
-		 timerS = sdkGetTimerValue(&sdkTimer);
-		 sdkResetTimer(&sdkTimer);
-		 ftimeS += timerS;
-		 fnups = 0.0;
-		 durchsatz = 0.0;
-		 for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
-		 {
-			 fnups += 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP * pow(2.,lev) / (ftimeS*1.0E6);
-			 durchsatz  +=  (27.0+1.0) * 4.0 * 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP  / (ftimeS*1.0E9);
-		 }
-		 output << timerS << " / " << ftimeS << " \t " <<  fnups << " \t " << durchsatz << "\n";
-         //////////////////////////////////////////////////////////////////////////
-		 //Timer Event
-		 checkCudaErrors( cudaEventRecord(stop_t));
-         checkCudaErrors( cudaEventSynchronize(stop_t));
-         checkCudaErrors( cudaEventElapsedTime( &timerE, start_t, stop_t));
-         ftimeE += timerE;
-         fnups = 0.0;
-         durchsatz = 0.0;
-         for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
-         {
-            fnups += 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP * pow(2.,lev) / (ftimeE*1.0E6);
-            durchsatz  +=  (27.0+1.0) * 4.0 * 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP  / (ftimeE*1.0E9);
-         }
-         output << timerE << " / " << ftimeE << " \t " <<  fnups << " \t " << durchsatz << "\n";
+		//////////////////////////////////////////////////////////////////////////
+		averageTimer->stopTimer();
+		averageTimer->outputPerformance(t, para.get());
+		//////////////////////////////////////////////////////////////////////////
 
          if( para->getPrintFiles() )
          {
@@ -942,44 +901,11 @@ void Simulation::run()
 			output << "done.\n";
 			////////////////////////////////////////////////////////////////////////
          }
-		 sdkStartTimer(&sdkTimer);
-         checkCudaErrors( cudaEventRecord(start_t));
-      }
-	}
-
 
-	//////////////////////////////////////////////////////////////////////////
-	//Timer SDK
-	sdkStopTimer(&sdkTimer);
-	timerS = sdkGetTimerValue(&sdkTimer);
-	ftimeS += timerS;
-	fnups = 0.0;
-	durchsatz = 0.0;
-	for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
-	{
-		fnups += 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP * pow(2.,lev) / (ftimeS*1.0E6);
-		durchsatz  +=  (27.0+1.0) * 4.0 * 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP / (ftimeS*1.0E9);
+		////////////////////////////////////////////////////////////////////////
+		averageTimer->startTimer();
+      }
 	}
-	output << "Processing time: " << ftimeS << "(ms)\n";
-	output << "Nups in Mio: " << fnups << "\n";
-	output << "Durchsatz in GB/sec: " << durchsatz << "\n";
-    //////////////////////////////////////////////////////////////////////////
-	//Timer Event
-    checkCudaErrors( cudaEventRecord(stop_t));
-    checkCudaErrors( cudaEventSynchronize(stop_t));
-    checkCudaErrors( cudaEventElapsedTime( &timerE, start_t, stop_t ));
-    ftimeE += timerE;
-    fnups = 0.0;
-    durchsatz = 0.0;
-    for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
-    {
-       fnups += 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP * pow(2.,lev) / (ftimeE*1.0E6);
-       durchsatz  +=  (27.0+1.0) * 4.0 * 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP / (ftimeE*1.0E9);
-    }
-    output << "Processing time: " << ftimeE << "(ms)\n";
-    output << "Nups in Mio: " << fnups << "\n";
-    output << "Durchsatz in GB/sec: " << durchsatz << "\n";
-	//////////////////////////////////////////////////////////////////////////
 
 	////////////////////////////////////////////////////////////////////////////////
 	//printDragLift(para);
@@ -1019,11 +945,7 @@ void Simulation::run()
 	//		MeasurePointWriter::writeMeasurePoints(para, lev, j, 0);
 	//	}
 	//}                                                  
- //  //////////////////////////////////////////////////////////////////////////
-
-	checkCudaErrors(cudaEventDestroy(start_t));
-	checkCudaErrors(cudaEventDestroy(stop_t));
-	sdkDeleteTimer(&sdkTimer);   
+ //  //////////////////////////////////////////////////////////////////////////  
 }
 
 void Simulation::porousMedia()
diff --git a/src/gpu/VirtualFluids_GPU/Output/TimeStepTimer.cpp b/src/gpu/VirtualFluids_GPU/Output/TimeStepTimer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..78da4947d0c8196cda49fef754a3f44fc39d0a44
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/TimeStepTimer.cpp
@@ -0,0 +1,84 @@
+#include "helper_cuda.h"
+#include <cuda_runtime.h>
+#include "Core/DataTypes.h"
+#include "UbScheduler.h"
+#include "Parameter/Parameter.h"
+
+#include "Timer.h"
+#include "TimeStepTimer.h"
+
+namespace
+{
+//! \brief Tells whether the timers are active on time step t.
+//! An activation interval of 0 means "never active"; this also keeps t % 0 from being evaluated.
+bool isActiveStep(uint t, uint tActivate)
+{
+    return tActivate != 0 && t % tActivate == 0;
+}
+} // namespace
+
+//! Start/stop forwarders: each one drives the Timer of a single phase, and only on active time steps.
+void TimeStepTimer::startTotalTimer             (uint t){ if(isActiveStep(t, this->tActivate)) this->totalTimer->startTimer();              }
+void TimeStepTimer::stopTotalTimer              (uint t){ if(isActiveStep(t, this->tActivate)) this->totalTimer->stopTimer();               }
+void TimeStepTimer::startCollisionTimer         (uint t){ if(isActiveStep(t, this->tActivate)) this->collisionTimer->startTimer();          }
+void TimeStepTimer::stopCollisionTimer          (uint t){ if(isActiveStep(t, this->tActivate)) this->collisionTimer->stopTimer();           }
+void TimeStepTimer::startPostCollisionBCTimer   (uint t){ if(isActiveStep(t, this->tActivate)) this->postCollisionBCTimer->startTimer();    }
+void TimeStepTimer::stopPostCollisionBCTimer    (uint t){ if(isActiveStep(t, this->tActivate)) this->postCollisionBCTimer->stopTimer();     }
+void TimeStepTimer::startPreCollisionBCTimer    (uint t){ if(isActiveStep(t, this->tActivate)) this->preCollisionBCTimer->startTimer();     }
+void TimeStepTimer::stopPreCollisionBCTimer     (uint t){ if(isActiveStep(t, this->tActivate)) this->preCollisionBCTimer->stopTimer();      }
+void TimeStepTimer::startEddyViscosityTimer     (uint t){ if(isActiveStep(t, this->tActivate)) this->eddyViscosityTimer->startTimer();      }
+void TimeStepTimer::stopEddyViscosityTimer      (uint t){ if(isActiveStep(t, this->tActivate)) this->eddyViscosityTimer->stopTimer();       }
+void TimeStepTimer::startActuatorTimer          (uint t){ if(isActiveStep(t, this->tActivate)) this->actuatorTimer->startTimer();           }
+void TimeStepTimer::stopActuatorTimer           (uint t){ if(isActiveStep(t, this->tActivate)) this->actuatorTimer->stopTimer();            }
+void TimeStepTimer::startProbeTimer             (uint t){ if(isActiveStep(t, this->tActivate)) this->probeTimer->startTimer();              }
+void TimeStepTimer::stopProbeTimer              (uint t){ if(isActiveStep(t, this->tActivate)) this->probeTimer->stopTimer();               }
+void TimeStepTimer::startExchangeTimer          (uint t){ if(isActiveStep(t, this->tActivate)) this->exchangeTimer->startTimer();           }
+void TimeStepTimer::stopExchangeTimer           (uint t){ if(isActiveStep(t, this->tActivate)) this->exchangeTimer->stopTimer();            }
+
+//! \brief Clears the accumulated times of all timers on active time steps.
+void TimeStepTimer::resetTimers(uint t)
+{
+    if(!isActiveStep(t, this->tActivate)) return;
+
+    this->totalTimer->resetTimer();
+    this->collisionTimer->resetTimer();
+    this->postCollisionBCTimer->resetTimer();
+    this->preCollisionBCTimer->resetTimer();
+    this->eddyViscosityTimer->resetTimer();
+    this->actuatorTimer->resetTimer();
+    this->probeTimer->resetTimer();
+    // The exchange timer is part of the breakdown in outputPerformance, so it has to be cleared with the others.
+    this->exchangeTimer->resetTimer();
+}
+
+//! \brief Logs, on active time steps, the share each phase has in the summed phase time.
+//! \param t    current time step
+//! \param para not used at the moment; kept so the signature matches Timer::outputPerformance
+void TimeStepTimer::outputPerformance(uint t, Parameter* para)
+{
+    if(!isActiveStep(t, this->tActivate)) return;
+
+    const float tCollision       = this->collisionTimer->getTotalElapsedTime();
+    const float tPostCollisionBC = this->postCollisionBCTimer->getTotalElapsedTime();
+    const float tPreCollisionBC  = this->preCollisionBCTimer->getTotalElapsedTime();
+    const float tEddyViscosity   = this->eddyViscosityTimer->getTotalElapsedTime();
+    const float tActuator        = this->actuatorTimer->getTotalElapsedTime();
+    const float tProbe           = this->probeTimer->getTotalElapsedTime();
+    const float tExchange        = this->exchangeTimer->getTotalElapsedTime();
+
+    // The reference is the sum of the phase timers, not the separately measured total timer.
+    const float tTotal = tCollision+tPostCollisionBC+tPreCollisionBC+tEddyViscosity+tActuator+tProbe+tExchange;
+
+    // Nothing was measured yet: skip the output rather than print NaN percentages.
+    if(tTotal <= 0.0f) return;
+
+    const auto percent = [tTotal](float part){ return (part/tTotal)*100; };
+
+    VF_LOG_INFO(" --- Collision \t {}%",        percent(tCollision) );
+    VF_LOG_INFO(" --- PostCollisionBCs \t {}%", percent(tPostCollisionBC) );
+    VF_LOG_INFO(" --- PreCollisionBCs \t {}%",  percent(tPreCollisionBC) );
+    VF_LOG_INFO(" --- Eddy viscosity \t {}%",   percent(tEddyViscosity) );
+    VF_LOG_INFO(" --- Actuators \t {}%",        percent(tActuator) );
+    VF_LOG_INFO(" --- Probes \t\t {}%",           percent(tProbe) );
+    VF_LOG_INFO(" --- Data exchange \t {}%",    percent(tExchange) );
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Output/TimeStepTimer.h b/src/gpu/VirtualFluids_GPU/Output/TimeStepTimer.h
new file mode 100644
index 0000000000000000000000000000000000000000..982d1ce56bfadb7eddfd3d34d8d6b01ac6f92233
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/TimeStepTimer.h
@@ -0,0 +1,67 @@
+#ifndef TIMESTEPTIMER_H
+#define TIMESTEPTIMER_H
+
+#include <memory>
+#include <string>
+
+#include "helper_cuda.h"
+#include <cuda_runtime.h>
+#include "Core/DataTypes.h"
+#include "UbScheduler.h"
+#include "Parameter/Parameter.h"
+
+#include "Timer.h"
+
+//! \brief Bundle of Timer objects, one per phase of a time step, that are only driven on every tActivate-th step.
+//! The timers are owned through std::unique_ptr, so they are released together with this object
+//! and the object cannot be copied.
+class TimeStepTimer
+{
+    public:
+    //! \param _name      label of this timer group
+    //! \param _tActivate interval, in time steps, on which the timers are active
+    TimeStepTimer(std::string _name, uint _tActivate): name(_name), tActivate(_tActivate)
+    {
+    }
+
+    ~TimeStepTimer() = default;
+
+    //! Start/stop the timer of one phase; t is the current time step.
+    void startTotalTimer            (uint t);
+    void stopTotalTimer             (uint t);
+    void startCollisionTimer        (uint t);
+    void stopCollisionTimer         (uint t);
+    void startPostCollisionBCTimer  (uint t);
+    void stopPostCollisionBCTimer   (uint t);
+    void startPreCollisionBCTimer   (uint t);
+    void stopPreCollisionBCTimer    (uint t);
+    void startEddyViscosityTimer    (uint t);
+    void stopEddyViscosityTimer     (uint t);
+    void startActuatorTimer         (uint t);
+    void stopActuatorTimer          (uint t);
+    void startProbeTimer            (uint t);
+    void stopProbeTimer             (uint t);
+    void startExchangeTimer         (uint t);
+    void stopExchangeTimer          (uint t);
+
+    //! Clears the accumulated times of the timers.
+    void resetTimers(uint t);
+    //! Logs the relative share of each phase.
+    void outputPerformance(uint t, Parameter* para);
+
+    private:
+
+    std::unique_ptr<Timer> totalTimer           {new Timer("total")};
+    std::unique_ptr<Timer> collisionTimer       {new Timer("collision")};
+    std::unique_ptr<Timer> postCollisionBCTimer {new Timer("postCollisionBC")};
+    std::unique_ptr<Timer> preCollisionBCTimer  {new Timer("preCollisionBC")};
+    std::unique_ptr<Timer> eddyViscosityTimer   {new Timer("eddyViscosity")};
+    std::unique_ptr<Timer> actuatorTimer        {new Timer("actuator")};
+    std::unique_ptr<Timer> probeTimer           {new Timer("probes")};
+    std::unique_ptr<Timer> exchangeTimer        {new Timer("exchange")};
+
+    std::string name;
+    uint tActivate;
+};
+
+#endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.cpp b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..823364a22eca41517816c1fdb61dfdc96ef1d961
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.cpp
@@ -0,0 +1,65 @@
+
+#include <cmath>
+#include <iostream>
+#include <cuda_runtime.h>
+#include "UbScheduler.h"
+#include "Timer.h"
+
+
+//! \brief Creates the start and stop CUDA events used for the measurement.
+void Timer::initTimer()
+{
+    // Checked like every other CUDA call in this file, so a failed creation is reported here.
+    checkCudaErrors(cudaEventCreate(&this->start_t));
+    checkCudaErrors(cudaEventCreate(&this->stop_t ));
+}
+
+//! \brief Records the start event, i.e. begins a new measured interval.
+void Timer::startTimer()
+{
+    checkCudaErrors(cudaEventRecord(this->start_t));
+}
+
+//! \brief Records the stop event, waits for it, and adds the interval since the last start to the running total.
+void Timer::stopTimer()
+{
+    checkCudaErrors(cudaEventRecord(this->stop_t));
+    checkCudaErrors(cudaEventSynchronize(this->stop_t));
+    checkCudaErrors(cudaEventElapsedTime(&this->elapsedTime, this->start_t, this->stop_t));
+    this->totalElapsedTime += this->elapsedTime;
+}
+
+//! \brief Sets both the last interval and the running total back to zero.
+void Timer::resetTimer()
+{
+    this->elapsedTime = 0.0f;
+    this->totalElapsedTime = 0.0f;
+}
+
+//! \brief Logs the last interval, the running total and the update rate / bandwidth derived from the total.
+//! \param t    current time step; the number of computed steps is taken as t - para->getTStart()
+//! \param para supplies the start step, the level range and the node count of each level
+void Timer::outputPerformance(uint t, Parameter* para)
+{
+    real fnups      = 0.0;
+    real bandwidth  = 0.0;
+
+    // Without any recorded time the rates are undefined; keep them at 0 instead of logging inf/NaN.
+    if(this->totalElapsedTime > 0.0f)
+    {
+        for (int lev=para->getCoarse(); lev <= para->getFine(); lev++)
+        {
+            fnups       += 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP * std::pow(2.,lev) / (this->totalElapsedTime*1.0E6);
+            bandwidth   += (27.0+1.0) * 4.0 * 1000.0 * (t-para->getTStart()) * para->getParH(lev)->size_Mat_SP  / (this->totalElapsedTime*1.0E9);
+        }
+    }
+
+    // The column header is written once, ahead of the first data row.
+    if(this->firstOutput)
+    {
+        VF_LOG_INFO(" --- {} --- Processing time (ms) \t Nups in Mio \t Bandwidth in GB/sec", this->name );
+        this->firstOutput = false;
+    }
+
+    VF_LOG_INFO(" --- {} --- {}/{} \t {} \t {}", this->name, this->elapsedTime, this->totalElapsedTime, fnups, bandwidth  );
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Output/Timer.h b/src/gpu/VirtualFluids_GPU/Output/Timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6432b347458e68a5089aea3de625017d6facd34b
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/Output/Timer.h
@@ -0,0 +1,58 @@
+#ifndef TIMER_H
+#define TIMER_H
+
+#include <string>
+
+#include "helper_cuda.h"
+#include <cuda_runtime.h>
+#include "Core/DataTypes.h"
+
+#include "UbScheduler.h"
+#include "logger/Logger.h"
+#include "Parameter/Parameter.h"
+
+//! \brief Named stopwatch built on a pair of CUDA events.
+//! Keeps the length of the last start/stop interval and the sum of all intervals since the last reset.
+class Timer
+{
+    public:
+    //! Creates the CUDA events; _name is the label used in the log output.
+    Timer(std::string _name): name(_name)
+    {
+        this->initTimer();
+    }
+
+    //! Destroys the CUDA events.
+    ~Timer()
+    {
+        cudaEventDestroy(this->start_t);
+        cudaEventDestroy(this->stop_t);
+    }
+
+    // The events are destroyed in the destructor, so a copy would destroy the same events a second time.
+    Timer(const Timer&) = delete;
+    Timer& operator=(const Timer&) = delete;
+
+    void initTimer();
+    void startTimer();
+    void stopTimer();
+    void resetTimer();
+    void outputPerformance(uint t, Parameter* para);
+
+    //! Length of the most recent start/stop interval.
+    float getElapsedTime() const { return this->elapsedTime; }
+    //! Sum of all intervals since construction or the last resetTimer().
+    float getTotalElapsedTime() const { return this->totalElapsedTime; }
+
+    private:
+
+    cudaEvent_t start_t, stop_t;
+    float elapsedTime = 0.0f;
+    float totalElapsedTime = 0.0f;
+    std::string name;
+
+    //! True until the column header has been logged once.
+    bool firstOutput = true;
+};
+
+#endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
index b29db903ad0ad5e4ea9c18e36f152d81c7b952c6..c10b5b690bf8aa2c819b26acf1509f337debafe3 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.cpp
@@ -109,12 +109,15 @@ void Parameter::readConfigData(const vf::basics::ConfigurationFile &configData)
     //////////////////////////////////////////////////////////////////////////
     if (configData.contains("UseMeasurePoints"))
         this->setUseMeasurePoints(configData.getValue<bool>("UseMeasurePoints"));
-    //////////////////////////////////////////////////////////////////////////
+	//////////////////////////////////////////////////////////////////////////
     if (configData.contains("UseWale"))
         this->setUseWale(configData.getValue<bool>("UseWale"));
 	//////////////////////////////////////////////////////////////////////////
     if (configData.contains("UseAMD"))
         this->setUseAMD(configData.getValue<bool>("UseAMD"));
+	//////////////////////////////////////////////////////////////////////////
+    if (configData.contains("SGSconstant"))
+        this->setSGSConstant(configData.getValue<real>("SGSconstant"));
     //////////////////////////////////////////////////////////////////////////
     if (configData.contains("UseInitNeq"))
         this->setUseInitNeq(configData.getValue<bool>("UseInitNeq"));
@@ -572,11 +575,12 @@ void Parameter::setForcing(real forcingX, real forcingY, real forcingZ)
 	this->hostForcing[2] = forcingZ;
 }
 void Parameter::setQuadricLimiters(real quadricLimiterP, real quadricLimiterM, real quadricLimiterD)
-{
+{	
 	this->hostQuadricLimiters[0] = quadricLimiterP;
 	this->hostQuadricLimiters[1] = quadricLimiterM;
 	this->hostQuadricLimiters[2] = quadricLimiterD;
 }
+
 void Parameter::setPhi(real inPhi)
 {
 	Phi = inPhi;
@@ -866,6 +870,10 @@ void Parameter::setSGSConstant(real SGSConstant)
 {
 	ic.SGSConstant = SGSConstant;
 }
+void Parameter::setHasWallModelMonitor(bool hasWallModelMonitor)
+{
+	this->ic.hasWallModelMonitor = hasWallModelMonitor; // stored in the InitCondition member ic
+}
 void Parameter::setUseInitNeq(bool useInitNeq)
 {
 	ic.isInitNeq = useInitNeq;
@@ -2261,6 +2269,10 @@ real Parameter::getSGSConstant()
 {
 	return ic.SGSConstant;
 }
+bool Parameter::getHasWallModelMonitor()
+{
+	return this->ic.hasWallModelMonitor; // read back from the InitCondition member ic
+}
 bool Parameter::getUseInitNeq()
 {
 	return ic.isInitNeq;
diff --git a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
index ea9b42f94e63a36c2fd0b6b669e959a8b4810e8f..48cf410ff8b700ef69d26883c5ef22048f9fd322 100644
--- a/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
+++ b/src/gpu/VirtualFluids_GPU/Parameter/Parameter.h
@@ -189,9 +189,9 @@ struct LBMSimulationParameter
     unsigned int mem_size_kFC_off;
 
     // BC's////////////////////
-    QforBoundaryConditions QWall, Qinflow, Qoutflow, QSlip;
-    unsigned int kQ = 0, kInflowQ = 0, kOutflowQ = 0, kSlipQ = 0;
-    unsigned int kQread, kInflowQread, kOutflowQread, kSlipQread;
+    QforBoundaryConditions QWall, Qinflow, Qoutflow, QSlip, QStress;
+    unsigned int kQ = 0, kInflowQ = 0, kOutflowQ = 0, kSlipQ = 0, kStressQ = 0;
+    unsigned int kQread, kInflowQread, kOutflowQread, kSlipQread, kStressQread;
 
     QforBoundaryConditions QpressX0, QpressX1, QpressY0, QpressY1, QpressZ0, QpressZ1;
     QforBoundaryConditions QPropeller;
@@ -203,6 +203,9 @@ struct LBMSimulationParameter
     QforBoundaryConditions QInlet, QOutlet, QPeriodic;
     unsigned int kInletQread, kOutletQread;
     unsigned int kPressQ = 0, kPressQread;
+
+    WallModelParameters wallModel;
+    
     // testRoundoffError
     Distributions27 kDistTestRE;
 
@@ -453,6 +456,7 @@ public:
     void setUseTurbulentViscosity(bool useTurbulentViscosity);
     void setUseAMD( bool useAMD);
     void setSGSConstant( real SGSConstant);
+    void setHasWallModelMonitor(bool hasWallModelMonitor);
     void setUseInitNeq(bool useInitNeq);
     void setSimulatePorousMedia(bool simulatePorousMedia);
     void setIsF3(bool isF3);
@@ -714,6 +718,7 @@ public:
     bool getUseTurbulentViscosity();
     bool getUseAMD();
     real getSGSConstant();
+    bool getHasWallModelMonitor();
     bool getUseInitNeq();
     bool getSimulatePorousMedia();
     bool getIsF3();
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5ac087ccfec2dc71439054921c8500568c9c070d
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.cu
@@ -0,0 +1,457 @@
+#include "Probe.h"
+#include "PlanarAverageProbe.h"
+
+#include <cuda/CudaGrid.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/device_ptr.h>
+#include <thrust/inner_product.h>
+
+#include "Parameter/Parameter.h"
+#include "DataStructureInitializer/GridProvider.h"
+#include "GPU/CudaMemoryManager.h"
+
+#include <algorithm>
+
+///////////////////////////////////////////////////////////////////////////////////
+/// Functors for thrust reductions
+///////////////////////////////////////////////////////////////////////////////////
+
+// Unary functor usable in thrust transforms:
+// returns the square of its argument.
+template<typename T>
+struct pow2 : thrust::unary_function<T,T>
+{
+  __host__ __device__
+  T operator()(const T& value) const { return value * value; }
+};
+
+// Unary functor usable in thrust transforms:
+// returns the third power of its argument (multiplied left to right).
+template<typename T>
+struct pow3 : thrust::unary_function<T,T>
+{
+  __host__ __device__
+  T operator()(const T& value) const { return value * value * value; }
+};
+
+// Unary functor usable in thrust transforms:
+// returns the fourth power of its argument (multiplied left to right).
+template<typename T>
+struct pow4 : thrust::unary_function<T,T>
+{
+  __host__ __device__
+  T operator()(const T& value) const { return value * value * value * value; }
+};
+
+// Functor for thrust::transform_reduce: maps x to (x - mean)^n, the n-th power of the fluctuation.
+struct nth_moment
+{
+    const real mean; // kept in the solver precision `real`, so nothing is narrowed to float
+    const int n;     // exponent; for n <= 1 the plain fluctuation is returned
+
+    nth_moment(real _mean, int _n) : mean(_mean), n(_n) {}
+
+    __host__ __device__
+    real operator()(const real& x) const
+    {
+        const real fluctuation = x-mean;
+        real moment = fluctuation;
+        for(int i = 1; i<n; i++) moment *= fluctuation;
+        return moment;
+    }
+};
+
+
+///////////////////////////////////////////////////////////////////////////////////
+
+__global__ void moveIndicesInPosNormalDir( uint* pointIndices, uint nPoints, uint* neighborNormal, real* coordsX, real* coordsY, real* coordsZ )
+{
+    // Replace each stored index k by neighborNormal[k].
+    // The coords arguments are not read by this kernel.
+    //
+    // Flatten (threadIdx.x, blockIdx.x, blockIdx.y) into one linear thread id.
+    const uint blockId  = gridDim.x*blockIdx.y + blockIdx.x;
+    const uint threadId = blockDim.x*blockId + threadIdx.x;
+
+    // Threads beyond the index list have nothing to move.
+    if(threadId < nPoints)
+    {
+        const uint current = pointIndices[threadId];
+
+        pointIndices[threadId] = neighborNormal[current];
+    }
+}
+
+__global__ void moveIndicesInNegNormalDir( uint* pointIndices, uint nPoints, uint* neighborWSB, uint* neighborInplane1, uint* neighborInplane2, real* coordsX, real* coordsY, real* coordsZ )
+{
+    // Replace each stored index k by neighborWSB[neighborInplane1[neighborInplane2[k]]].
+    // The coords arguments are not read by this kernel.
+    //
+    // Flatten (threadIdx.x, blockIdx.x, blockIdx.y) into one linear thread id.
+    const uint blockId  = gridDim.x*blockIdx.y + blockIdx.x;
+    const uint threadId = blockDim.x*blockId + threadIdx.x;
+
+    // Threads beyond the index list have nothing to move.
+    if(threadId < nPoints)
+    {
+        const uint inplaneShifted = neighborInplane1[neighborInplane2[pointIndices[threadId]]];
+
+        pointIndices[threadId] = neighborWSB[inplaneShifted];
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+bool PlanarAverageProbe::isAvailableStatistic(Statistic _variable)
+{
+    // Only the Spatial* statistics and their SpatioTemporal* counterparts
+    // are offered by this probe; everything else is rejected.
+    switch (_variable)
+    {
+        case Statistic::SpatialMeans:
+        case Statistic::SpatioTemporalMeans:
+        case Statistic::SpatialCovariances:
+        case Statistic::SpatioTemporalCovariances:
+        case Statistic::SpatialSkewness:
+        case Statistic::SpatioTemporalSkewness:
+        case Statistic::SpatialFlatness:
+        case Statistic::SpatioTemporalFlatness:
+            return true;
+
+        // Listed explicitly although they share the result of the default branch.
+        case Statistic::Instantaneous:
+        case Statistic::Means:
+        case Statistic::Variances:
+        default:
+            break;
+    }
+
+    return false;
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+std::vector<PostProcessingVariable> PlanarAverageProbe::getPostProcessingVariables(Statistic statistic)
+{   // Lists the output variables of one statistic; the second constructor argument is presumably a unit conversion factor (TODO confirm).
+    std::vector<PostProcessingVariable> variables;
+    auto add = [&variables](const char* name, auto factor) { variables.push_back( PostProcessingVariable(name, factor) ); };
+    const auto velocityRatioSquared = pow(this->velocityRatio, 2.0); // shared by all covariance entries
+    switch (statistic)
+    {
+    case Statistic::SpatialMeans:
+        add("vx_spatMean", this->velocityRatio);
+        add("vy_spatMean", this->velocityRatio);
+        add("vz_spatMean", this->velocityRatio);
+        break;
+    case Statistic::SpatioTemporalMeans:
+        add("vx_spatTmpMean", this->velocityRatio);
+        add("vy_spatTmpMean", this->velocityRatio);
+        add("vz_spatTmpMean", this->velocityRatio);
+        break;
+    case Statistic::SpatialCovariances:
+        add("vxvx_spatMean", velocityRatioSquared);
+        add("vyvy_spatMean", velocityRatioSquared);
+        add("vzvz_spatMean", velocityRatioSquared);
+        add("vxvy_spatMean", velocityRatioSquared);
+        add("vxvz_spatMean", velocityRatioSquared);
+        add("vyvz_spatMean", velocityRatioSquared);
+        break;
+    case Statistic::SpatioTemporalCovariances:
+        add("vxvx_spatTmpMean", velocityRatioSquared);
+        add("vyvy_spatTmpMean", velocityRatioSquared);
+        add("vzvz_spatTmpMean", velocityRatioSquared);
+        add("vxvy_spatTmpMean", velocityRatioSquared);
+        add("vxvz_spatTmpMean", velocityRatioSquared);
+        add("vyvz_spatTmpMean", velocityRatioSquared);
+        break;
+    case Statistic::SpatialSkewness:
+        add("Sx_spatMean", 1.0);
+        add("Sy_spatMean", 1.0);
+        add("Sz_spatMean", 1.0);
+        break;
+    case Statistic::SpatioTemporalSkewness:
+        add("Sx_spatTmpMean", 1.0);
+        add("Sy_spatTmpMean", 1.0);
+        add("Sz_spatTmpMean", 1.0);
+        break;
+    case Statistic::SpatialFlatness:
+        add("Fx_spatMean", 1.0);
+        add("Fy_spatMean", 1.0);
+        add("Fz_spatMean", 1.0);
+        break;
+    case Statistic::SpatioTemporalFlatness:
+        add("Fx_spatTmpMean", 1.0);
+        add("Fy_spatTmpMean", 1.0);
+        add("Fz_spatTmpMean", 1.0);
+        break;
+    default: // statistics this probe does not provide
+        printf("Statistic unavailable in PlanarAverageProbe\n");
+        assert(false);
+    }
+    return variables;
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+void PlanarAverageProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
+                            std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,
+                            std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
+                            int level)
+{
+    // Determines the probe points of one grid level: one output point per distinct fluid-node
+    // coordinate along planeNormal, plus the node indices of the first (lowest-coordinate) plane.
+    // para               : source of the host-side arrays of this level (coordinates, geoSP, size_Mat_SP)
+    // probeIndices_level : receives the index of every fluid node located in the first plane
+    // pointCoords*_level : receive one entry per plane; the normal coordinate is the plane position,
+    //                      both in-plane coordinates are set to the placeholder 999999.
+    // gridProvider and dist*_level are not used by this probe.
+    auto parH = para->getParH(level);
+
+    // Map x/y/z onto the normal and in-plane roles; nullptr until a valid planeNormal is matched.
+    real* pointCoordsNormal_par = nullptr;
+    std::vector<real> *pointCoordsNormal = nullptr, *pointCoordsInplane1 = nullptr, *pointCoordsInplane2 = nullptr;
+
+    if(this->planeNormal == 'x')
+    {
+        pointCoordsNormal     = &pointCoordsX_level;
+        pointCoordsInplane1   = &pointCoordsY_level;
+        pointCoordsInplane2   = &pointCoordsZ_level;
+        pointCoordsNormal_par = parH->coordX_SP;
+    }
+    else if(this->planeNormal == 'y')
+    {
+        pointCoordsNormal     = &pointCoordsY_level;
+        pointCoordsInplane1   = &pointCoordsX_level;
+        pointCoordsInplane2   = &pointCoordsZ_level;
+        pointCoordsNormal_par = parH->coordY_SP;
+    }
+    else if(this->planeNormal == 'z')
+    {
+        pointCoordsNormal     = &pointCoordsZ_level;
+        pointCoordsInplane1   = &pointCoordsX_level;
+        pointCoordsInplane2   = &pointCoordsY_level;
+        pointCoordsNormal_par = parH->coordZ_SP;
+    }
+    else
+    {   // The constructor only asserts the normal, so release builds are guarded here as well.
+        return;
+    }
+
+    // Find all points along the normal direction (node index 0 is skipped)
+    for(uint j=1; j<parH->size_Mat_SP; j++ )
+    {
+        if(parH->geoSP[j] != GEO_FLUID) continue;
+        const real normalCoord = pointCoordsNormal_par[j];
+        if( std::find(pointCoordsNormal->begin(), pointCoordsNormal->end(), normalCoord) == pointCoordsNormal->end())
+        {
+            pointCoordsNormal->push_back( normalCoord );
+            pointCoordsInplane1->push_back(999999.);
+            pointCoordsInplane2->push_back(999999.);
+        }
+    }
+    std::sort(pointCoordsNormal->begin(), pointCoordsNormal->end());
+
+    // Find all nodes in the first plane; their coordinates are not stored (might become relevant for two-point correlations)
+    for(uint j=1; j<parH->size_Mat_SP; j++ )
+    {
+        if( parH->geoSP[j] == GEO_FLUID && pointCoordsNormal_par[j] == pointCoordsNormal->at(0)) probeIndices_level.push_back(j);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+void PlanarAverageProbe::calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level)
+{
+    // Computes the enabled plane-wise velocity statistics for every plane of this level.
+    // probeStruct->pointIndicesD holds the node indices of the current plane; once a plane has been
+    // reduced, one of the moveIndices kernels shifts these indices to the next plane.
+    // Results go to probeStruct->quantitiesArrayH[(arrayOffset+component)*nPoints + plane].
+    // The statistics are nested: covariances are only evaluated together with SpatialMeans,
+    // skewness only together with SpatialCovariances and flatness only together with SpatialSkewness.
+    // Definition of normal and inplane directions for moveIndices kernels
+    uint *neighborNormal = nullptr, *neighborInplane1 = nullptr, *neighborInplane2 = nullptr;
+    if( this->planeNormal == 'x' )
+    {
+        neighborNormal   = para->getParD(level)->neighborX_SP;
+        neighborInplane1 = para->getParD(level)->neighborY_SP;
+        neighborInplane2 = para->getParD(level)->neighborZ_SP;
+    }
+    else if( this->planeNormal == 'y' )
+    {
+        neighborNormal   = para->getParD(level)->neighborY_SP;
+        neighborInplane1 = para->getParD(level)->neighborX_SP;
+        neighborInplane2 = para->getParD(level)->neighborZ_SP;
+    }
+    else if( this->planeNormal == 'z' )
+    {
+        neighborNormal   = para->getParD(level)->neighborZ_SP;
+        neighborInplane1 = para->getParD(level)->neighborX_SP;
+        neighborInplane2 = para->getParD(level)->neighborY_SP;
+    }
+
+    bool doTmpAveraging = (t>this->getTStartTmpAveraging());
+
+    // Pointer casts to use device arrays in thrust reductions
+    thrust::device_ptr<uint> indices_thrust = thrust::device_pointer_cast(probeStruct->pointIndicesD);
+    thrust::device_ptr<real> vx_thrust = thrust::device_pointer_cast(para->getParD(level)->vx_SP);
+    thrust::device_ptr<real> vy_thrust = thrust::device_pointer_cast(para->getParD(level)->vy_SP);
+    thrust::device_ptr<real> vz_thrust = thrust::device_pointer_cast(para->getParD(level)->vz_SP);
+
+    real N = (real)probeStruct->nIndices;
+    real n = (real)probeStruct->vals;
+    uint nPoints = probeStruct->nPoints;
+    const real zero = 0; // reduction seed of type real: thrust accumulates in the type of the init value, so 0.f would force float
+    // Permutation iterators for direct iteration over the velocities of the planes
+    typedef thrust::device_vector<real>::iterator valIterator;
+    typedef thrust::device_vector<uint>::iterator indIterator;
+    thrust::permutation_iterator<valIterator, indIterator> vx_iter_begin(vx_thrust, indices_thrust);
+    thrust::permutation_iterator<valIterator, indIterator> vx_iter_end  (vx_thrust, indices_thrust+probeStruct->nIndices);
+    thrust::permutation_iterator<valIterator, indIterator> vy_iter_begin(vy_thrust, indices_thrust);
+    thrust::permutation_iterator<valIterator, indIterator> vy_iter_end  (vy_thrust, indices_thrust+probeStruct->nIndices);
+    thrust::permutation_iterator<valIterator, indIterator> vz_iter_begin(vz_thrust, indices_thrust);
+    thrust::permutation_iterator<valIterator, indIterator> vz_iter_end  (vz_thrust, indices_thrust+probeStruct->nIndices);
+
+    for( uint i=0; i<nPoints; i++ )
+    {
+        uint node = this->isEvenTAvg? i : nPoints-1-i; // Note, loop moves in positive normal dir at even calls and in negative normal dir in odd calls
+
+        if(probeStruct->quantitiesH[int(Statistic::SpatialMeans)])
+        {
+            // Compute the instantaneous spatial means of the velocity moments
+            real spatMean_vx = thrust::reduce(vx_iter_begin, vx_iter_end)/N;
+            real spatMean_vy = thrust::reduce(vy_iter_begin, vy_iter_end)/N;
+            real spatMean_vz = thrust::reduce(vz_iter_begin, vz_iter_end)/N;
+
+            uint arrOff = probeStruct->arrayOffsetsH[int(Statistic::SpatialMeans)];
+            probeStruct->quantitiesArrayH[(arrOff+0)*nPoints+node] = spatMean_vx;
+            probeStruct->quantitiesArrayH[(arrOff+1)*nPoints+node] = spatMean_vy;
+            probeStruct->quantitiesArrayH[(arrOff+2)*nPoints+node] = spatMean_vz;
+
+            if(probeStruct->quantitiesH[int(Statistic::SpatioTemporalMeans)] && doTmpAveraging)
+            {   // Running mean in time: mean += (sample - mean)/n with n = probeStruct->vals
+                uint arrOffTmp = probeStruct->arrayOffsetsH[int(Statistic::SpatioTemporalMeans)];
+                real spatTmpMean_vx_old = probeStruct->quantitiesArrayH[(arrOffTmp+0)*nPoints+node];
+                real spatTmpMean_vy_old = probeStruct->quantitiesArrayH[(arrOffTmp+1)*nPoints+node];
+                real spatTmpMean_vz_old = probeStruct->quantitiesArrayH[(arrOffTmp+2)*nPoints+node];
+
+                probeStruct->quantitiesArrayH[(arrOffTmp+0)*nPoints+node] += (spatMean_vx-spatTmpMean_vx_old)/n;
+                probeStruct->quantitiesArrayH[(arrOffTmp+1)*nPoints+node] += (spatMean_vy-spatTmpMean_vy_old)/n;
+                probeStruct->quantitiesArrayH[(arrOffTmp+2)*nPoints+node] += (spatMean_vz-spatTmpMean_vz_old)/n;
+            }
+
+            if(probeStruct->quantitiesH[int(Statistic::SpatialCovariances)])
+            {   // <u_i' u_j'> = <u_i u_j> - <u_i>*<u_j>
+                real vx2 = thrust::transform_reduce(vx_iter_begin, vx_iter_end, pow2<real>(), zero, thrust::plus<real>())/N;
+                real vy2 = thrust::transform_reduce(vy_iter_begin, vy_iter_end, pow2<real>(), zero, thrust::plus<real>())/N;
+                real vz2 = thrust::transform_reduce(vz_iter_begin, vz_iter_end, pow2<real>(), zero, thrust::plus<real>())/N;
+                real vxvy = thrust::inner_product(vx_iter_begin, vx_iter_end, vy_iter_begin, zero)/N;
+                real vxvz = thrust::inner_product(vx_iter_begin, vx_iter_end, vz_iter_begin, zero)/N;
+                real vyvz = thrust::inner_product(vy_iter_begin, vy_iter_end, vz_iter_begin, zero)/N;
+                real spatMean_vxvx = vx2-spatMean_vx*spatMean_vx;
+                real spatMean_vyvy = vy2-spatMean_vy*spatMean_vy;
+                real spatMean_vzvz = vz2-spatMean_vz*spatMean_vz;
+                real spatMean_vxvy = vxvy-spatMean_vx*spatMean_vy;
+                real spatMean_vxvz = vxvz-spatMean_vx*spatMean_vz;
+                real spatMean_vyvz = vyvz-spatMean_vy*spatMean_vz;
+
+                uint arrOffCov = probeStruct->arrayOffsetsH[int(Statistic::SpatialCovariances)];
+                probeStruct->quantitiesArrayH[(arrOffCov+0)*nPoints+node] = spatMean_vxvx;
+                probeStruct->quantitiesArrayH[(arrOffCov+1)*nPoints+node] = spatMean_vyvy;
+                probeStruct->quantitiesArrayH[(arrOffCov+2)*nPoints+node] = spatMean_vzvz;
+                probeStruct->quantitiesArrayH[(arrOffCov+3)*nPoints+node] = spatMean_vxvy;
+                probeStruct->quantitiesArrayH[(arrOffCov+4)*nPoints+node] = spatMean_vxvz;
+                probeStruct->quantitiesArrayH[(arrOffCov+5)*nPoints+node] = spatMean_vyvz;
+
+                if(probeStruct->quantitiesH[int(Statistic::SpatioTemporalCovariances)] && doTmpAveraging)
+                {
+                    uint arrOffCovTmp = probeStruct->arrayOffsetsH[int(Statistic::SpatioTemporalCovariances)];
+                    real spatTmpMean_vxvx_old = probeStruct->quantitiesArrayH[(arrOffCovTmp+0)*nPoints+node];
+                    real spatTmpMean_vyvy_old = probeStruct->quantitiesArrayH[(arrOffCovTmp+1)*nPoints+node];
+                    real spatTmpMean_vzvz_old = probeStruct->quantitiesArrayH[(arrOffCovTmp+2)*nPoints+node];
+                    real spatTmpMean_vxvy_old = probeStruct->quantitiesArrayH[(arrOffCovTmp+3)*nPoints+node];
+                    real spatTmpMean_vxvz_old = probeStruct->quantitiesArrayH[(arrOffCovTmp+4)*nPoints+node];
+                    real spatTmpMean_vyvz_old = probeStruct->quantitiesArrayH[(arrOffCovTmp+5)*nPoints+node];
+
+                    probeStruct->quantitiesArrayH[(arrOffCovTmp+0)*nPoints+node] += (spatMean_vxvx-spatTmpMean_vxvx_old)/n;
+                    probeStruct->quantitiesArrayH[(arrOffCovTmp+1)*nPoints+node] += (spatMean_vyvy-spatTmpMean_vyvy_old)/n;
+                    probeStruct->quantitiesArrayH[(arrOffCovTmp+2)*nPoints+node] += (spatMean_vzvz-spatTmpMean_vzvz_old)/n;
+                    probeStruct->quantitiesArrayH[(arrOffCovTmp+3)*nPoints+node] += (spatMean_vxvy-spatTmpMean_vxvy_old)/n;
+                    probeStruct->quantitiesArrayH[(arrOffCovTmp+4)*nPoints+node] += (spatMean_vxvz-spatTmpMean_vxvz_old)/n;
+                    probeStruct->quantitiesArrayH[(arrOffCovTmp+5)*nPoints+node] += (spatMean_vyvz-spatTmpMean_vyvz_old)/n;
+                }
+
+                if(probeStruct->quantitiesH[int(Statistic::SpatialSkewness)])
+                {   // <u_i'^3> = <u_i^3> - <u_i>^3 - 3 <u_i> <u_i'^2>
+                    // The central moments are reduced directly with nth_moment. The raw-moment alternative above (via pow3)
+                    // was noted by the original author to contain some bug, though potentially better in terms of round-off errors.
+                    real spatMean_vxvxvx = thrust::transform_reduce(vx_iter_begin, vx_iter_end, nth_moment(spatMean_vx, 3), zero, thrust::plus<real>())/N;
+                    real spatMean_vyvyvy = thrust::transform_reduce(vy_iter_begin, vy_iter_end, nth_moment(spatMean_vy, 3), zero, thrust::plus<real>())/N;
+                    real spatMean_vzvzvz = thrust::transform_reduce(vz_iter_begin, vz_iter_end, nth_moment(spatMean_vz, 3), zero, thrust::plus<real>())/N;
+                    real spatMean_Sx = spatMean_vxvxvx/pow(spatMean_vxvx, 1.5f);
+                    real spatMean_Sy = spatMean_vyvyvy/pow(spatMean_vyvy, 1.5f);
+                    real spatMean_Sz = spatMean_vzvzvz/pow(spatMean_vzvz, 1.5f);
+
+                    uint arrOffSkew = probeStruct->arrayOffsetsH[int(Statistic::SpatialSkewness)];
+                    probeStruct->quantitiesArrayH[(arrOffSkew+0)*nPoints+node] = spatMean_Sx;
+                    probeStruct->quantitiesArrayH[(arrOffSkew+1)*nPoints+node] = spatMean_Sy;
+                    probeStruct->quantitiesArrayH[(arrOffSkew+2)*nPoints+node] = spatMean_Sz;
+
+                    if(probeStruct->quantitiesH[int(Statistic::SpatioTemporalSkewness)] && doTmpAveraging)
+                    {
+                        uint arrOffSkewTmp = probeStruct->arrayOffsetsH[int(Statistic::SpatioTemporalSkewness)];
+                        real spatTmpMean_Sx_old = probeStruct->quantitiesArrayH[(arrOffSkewTmp+0)*nPoints+node];
+                        real spatTmpMean_Sy_old = probeStruct->quantitiesArrayH[(arrOffSkewTmp+1)*nPoints+node];
+                        real spatTmpMean_Sz_old = probeStruct->quantitiesArrayH[(arrOffSkewTmp+2)*nPoints+node];
+
+                        probeStruct->quantitiesArrayH[(arrOffSkewTmp+0)*nPoints+node] += (spatMean_Sx-spatTmpMean_Sx_old)/n;
+                        probeStruct->quantitiesArrayH[(arrOffSkewTmp+1)*nPoints+node] += (spatMean_Sy-spatTmpMean_Sy_old)/n;
+                        probeStruct->quantitiesArrayH[(arrOffSkewTmp+2)*nPoints+node] += (spatMean_Sz-spatTmpMean_Sz_old)/n;
+                    }
+
+                    if(probeStruct->quantitiesH[int(Statistic::SpatialFlatness)])
+                    {   // <u_i'^4> = <u_i^4> - <u_i>^4 - 6 <u_i>^2 <u_i'^2> - 4 <u> <u'^3>; reduced directly as a central moment, like the skewness
+                        real spatMean_vxvxvxvx = thrust::transform_reduce(vx_iter_begin, vx_iter_end, nth_moment(spatMean_vx, 4), zero, thrust::plus<real>())/N;
+                        real spatMean_vyvyvyvy = thrust::transform_reduce(vy_iter_begin, vy_iter_end, nth_moment(spatMean_vy, 4), zero, thrust::plus<real>())/N;
+                        real spatMean_vzvzvzvz = thrust::transform_reduce(vz_iter_begin, vz_iter_end, nth_moment(spatMean_vz, 4), zero, thrust::plus<real>())/N;
+                        real spatMean_Fx = spatMean_vxvxvxvx/(spatMean_vxvx*spatMean_vxvx);
+                        real spatMean_Fy = spatMean_vyvyvyvy/(spatMean_vyvy*spatMean_vyvy);
+                        real spatMean_Fz = spatMean_vzvzvzvz/(spatMean_vzvz*spatMean_vzvz);
+
+                        uint arrOffFlat = probeStruct->arrayOffsetsH[int(Statistic::SpatialFlatness)];
+                        probeStruct->quantitiesArrayH[(arrOffFlat+0)*nPoints+node] = spatMean_Fx;
+                        probeStruct->quantitiesArrayH[(arrOffFlat+1)*nPoints+node] = spatMean_Fy;
+                        probeStruct->quantitiesArrayH[(arrOffFlat+2)*nPoints+node] = spatMean_Fz;
+
+                        if(probeStruct->quantitiesH[int(Statistic::SpatioTemporalFlatness)] && doTmpAveraging)
+                        {
+                            uint arrOffFlatTmp = probeStruct->arrayOffsetsH[int(Statistic::SpatioTemporalFlatness)];
+                            real spatTmpMean_Fx_old = probeStruct->quantitiesArrayH[(arrOffFlatTmp+0)*nPoints+node];
+                            real spatTmpMean_Fy_old = probeStruct->quantitiesArrayH[(arrOffFlatTmp+1)*nPoints+node];
+                            real spatTmpMean_Fz_old = probeStruct->quantitiesArrayH[(arrOffFlatTmp+2)*nPoints+node];
+
+                            probeStruct->quantitiesArrayH[(arrOffFlatTmp+0)*nPoints+node] += (spatMean_Fx-spatTmpMean_Fx_old)/n;
+                            probeStruct->quantitiesArrayH[(arrOffFlatTmp+1)*nPoints+node] += (spatMean_Fy-spatTmpMean_Fy_old)/n;
+                            probeStruct->quantitiesArrayH[(arrOffFlatTmp+2)*nPoints+node] += (spatMean_Fz-spatTmpMean_Fz_old)/n;
+                        }
+                    }   // SpatialFlatness
+                }   // SpatialSkewness
+            }   // SpatialCovariances
+        }   // SpatialMeans
+        if(i<nPoints-1) // shift the index list to the next plane, except after the last one
+        {
+            vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, probeStruct->nIndices);
+            if(this->isEvenTAvg)
+                moveIndicesInPosNormalDir<<<grid.grid, grid.threads>>>( probeStruct->pointIndicesD, probeStruct->nIndices, neighborNormal, para->getParD(level)->coordX_SP, para->getParD(level)->coordY_SP, para->getParD(level)->coordZ_SP );
+            else
+                moveIndicesInNegNormalDir<<<grid.grid, grid.threads>>>( probeStruct->pointIndicesD, probeStruct->nIndices, para->getParD(level)->neighborWSB_SP, neighborInplane1, neighborInplane2, para->getParD(level)->coordX_SP, para->getParD(level)->coordY_SP, para->getParD(level)->coordZ_SP );
+        }
+    }
+    this->isEvenTAvg=!this->isEvenTAvg;
+
+    getLastCudaError("PlanarAverageProbe::calculateQuantities execution failed");
+}
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h
new file mode 100644
index 0000000000000000000000000000000000000000..7054f5fc7e02453418285281a0ea9cf9c32dc0c0
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlanarAverageProbe.h
@@ -0,0 +1,97 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PlanarAverageProbe.h
+//! \author Henrik Asmuth
+//! \date 13/05/2022
+//! \brief Probe computing statistics across planes spanning the entire domain
+//!
+//! Computes spatial statistics across x, y or z-normal planes defined by planeNormal. 
+//! The planes include all points of the domain at each respective position along that normal direction.
+//! The spatial statistics can additionally be averaged in time.
+//!
+//=======================================================================================
+
+#ifndef PlanarAverageProbe_H
+#define PlanarAverageProbe_H
+
+#include "Probe.h"
+
+__global__ void moveIndicesInNegNormalDir( uint* pointIndices, uint nPoints, uint* neighborWSB, uint* neighborInplane1, uint* neighborInplane2, real* coordsX, real* coordsY, real* coordsZ ); 
+// Both kernels are defined in PlanarAverageProbe.cu and rewrite the first nPoints entries of pointIndices in place via the given neighbor lists.
+__global__ void moveIndicesInPosNormalDir( uint* pointIndices, uint nPoints, uint* neighborNormal, real* coordsX, real* coordsY, real* coordsZ );
+
+///////////////////////////////////////////////////////////////////////////////////
+
+class PlanarAverageProbe : public Probe
+{
+public: 
+    PlanarAverageProbe(
+        const std::string _probeName,
+        const std::string _outputPath,
+        uint _tStartAvg,
+        uint _tStartTmpAvg,
+        uint _tAvg,
+        uint _tStartOut,
+        uint _tOut,
+        char _planeNormal // 'x', 'y' or 'z'; only checked by the assert in the constructor body
+    ):  Probe(_probeName, 
+             _outputPath,
+             _tStartAvg,
+             _tStartTmpAvg,
+             _tAvg,
+             _tStartOut, 
+             _tOut,
+             false, // NOTE(review): the meaning of the two trailing Probe arguments is not visible here - confirm against Probe.h
+             false),
+        planeNormal(_planeNormal)
+
+    {   
+        assert(_planeNormal == 'x' || _planeNormal == 'y' || _planeNormal == 'z');
+    }
+
+
+private:
+    bool isAvailableStatistic(Statistic _variable) override;
+    // Returns the output variables (name and factor) belonging to one statistic.
+    std::vector<PostProcessingVariable> getPostProcessingVariables(Statistic variable) override;
+    // Collects one point per plane along planeNormal and the node indices of the first plane.
+    void findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
+                    std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,      
+                    std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
+                    int level) override;
+    void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) override;
+
+private:
+    real posX, posY, posZ;       // NOTE(review): never initialized and not referenced in PlanarAverageProbe.cu - candidates for removal
+    real deltaX, deltaY, deltaZ; // NOTE(review): same as above
+    char planeNormal;            // normal direction of the averaging planes
+    bool isEvenTAvg = true;      // toggled at the end of calculateQuantities(); selects the sweep direction through the planes
+};
+
+#endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.cu
index cf03d639add0c883793c6ffad041e7b6da6d98d3..15f10d8203a5d688da7f6bd18a976eaad5776b5c 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.cu
@@ -1,3 +1,4 @@
+#include "Probe.h"
 #include "PlaneProbe.h"
 
 #include <cuda/CudaGrid.h>
@@ -10,6 +11,66 @@
 #include "DataStructureInitializer/GridProvider.h"
 #include "GPU/CudaMemoryManager.h"
 
+
+bool PlaneProbe::isAvailableStatistic(Statistic _variable) // returns true if PlaneProbe can compute the requested statistic
+{
+    bool isAvailable;
+    switch (_variable)
+    {
+        case Statistic::Instantaneous: // the three statistics PlaneProbe accepts
+        case Statistic::Means:
+        case Statistic::Variances:
+            isAvailable = true;
+            break;
+        case Statistic::SpatialMeans: // all Spatial* / SpatioTemporal* statistics are rejected
+        case Statistic::SpatioTemporalMeans:
+        case Statistic::SpatialCovariances:
+        case Statistic::SpatioTemporalCovariances:
+        case Statistic::SpatialSkewness:
+        case Statistic::SpatioTemporalSkewness:
+        case Statistic::SpatialFlatness:
+        case Statistic::SpatioTemporalFlatness:
+            isAvailable = false;
+            break;
+        default: // any enumerator not listed above is rejected as well
+            isAvailable = false;
+    }
+    return isAvailable;
+}
+
+
+std::vector<PostProcessingVariable> PlaneProbe::getPostProcessingVariables(Statistic statistic)
+{ // Returns the output variables (name + ratio) written for one statistic; an empty vector if the statistic is unsupported.
+    std::vector<PostProcessingVariable> postProcessingVariables;
+    switch (statistic) // Probe::addProbeStruct uses the size of the returned vector to advance the quantity-array offset
+    {
+    case Statistic::Instantaneous:
+        postProcessingVariables.push_back( PostProcessingVariable("vx",  this->velocityRatio) ); // velocityRatio/densityRatio are set in Probe::init from Parameter
+        postProcessingVariables.push_back( PostProcessingVariable("vy",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("vz",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("rho", this->densityRatio ) );
+        break;
+    case Statistic::Means:
+        postProcessingVariables.push_back( PostProcessingVariable("vx_mean",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("vy_mean",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("vz_mean",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("rho_mean", this->densityRatio ) );
+        break;
+    case Statistic::Variances: // variances are paired with the squared ratio
+        postProcessingVariables.push_back( PostProcessingVariable("vx_var",  pow(this->velocityRatio, 2.0)) );
+        postProcessingVariables.push_back( PostProcessingVariable("vy_var",  pow(this->velocityRatio, 2.0)) );
+        postProcessingVariables.push_back( PostProcessingVariable("vz_var",  pow(this->velocityRatio, 2.0)) );
+        postProcessingVariables.push_back( PostProcessingVariable("rho_var", pow(this->densityRatio,  2.0)) );
+        break;
+
+    default: // statistic not handled by PlaneProbe
+        printf("Statistic unavailable in PlaneProbe\n");
+        assert(false); // aborts only when assertions are enabled; otherwise the empty vector is returned
+        break;
+    }
+    return postProcessingVariables;
+}
+
 void PlaneProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
                             std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,      
                             std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
@@ -39,13 +100,11 @@ void PlaneProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::ve
     }
 }
 
-void PlaneProbe::calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, int level)
+void PlaneProbe::calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) // t is not used in this implementation
 {
     vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, probeStruct->nPoints);
-    interpQuantities<<<grid.grid, grid.threads>>>(  probeStruct->pointIndicesD, probeStruct->nPoints, probeStruct->vals,
-                                                    probeStruct->distXD, probeStruct->distYD, probeStruct->distZD,
-                                                    para->getParD(level)->vx_SP, para->getParD(level)->vy_SP, para->getParD(level)->vz_SP, para->getParD(level)->rho_SP, 
-                                                    para->getParD(level)->neighborX_SP, para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP, 
-                                                    probeStruct->quantitiesD, probeStruct->arrayOffsetsD, probeStruct->quantitiesArrayD, false);
-
+    calcQuantitiesKernel<<<grid.grid, grid.threads>>>(  probeStruct->pointIndicesD, probeStruct->nPoints, probeStruct->vals, // kernel variant without distance arrays (no interpolation inputs)
+    para->getParD(level)->vx_SP, para->getParD(level)->vy_SP, para->getParD(level)->vz_SP, para->getParD(level)->rho_SP, 
+    para->getParD(level)->neighborX_SP, para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP, 
+    probeStruct->quantitiesD, probeStruct->arrayOffsetsD, probeStruct->quantitiesArrayD); // statistics are accumulated into quantitiesArrayD on the device
 }
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
index 1eb8197d4fcaa2ee44fd929af913c3c187a3dcdf..3440c01020f9b3505be7148024e47373b76648ff 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PlaneProbe.h
@@ -1,3 +1,41 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PlaneProbe.h
+//! \author Henry Korb, Henrik Asmuth
+//! \date 13/05/2022
+//! \brief Probe computing point-wise statistics for a set of points across a plane
+//!
+//! The plane is configured through setProbePlane() using (posX, posY, posZ) and (deltaX, deltaY, deltaZ).
+//! All statistics are temporal.
+//!
+//=======================================================================================
+
 #ifndef PlaneProbe_H
 #define PlaneProbe_H
 
@@ -10,13 +48,18 @@ public:
         const std::string _probeName,
         const std::string _outputPath,
         uint _tStartAvg,
+        uint _tAvg,
         uint _tStartOut,
         uint _tOut
     ): Probe(_probeName, 
              _outputPath,
              _tStartAvg, 
+             0,
+             _tAvg,
              _tStartOut, 
-             _tOut)
+             _tOut,
+             true,
+             false)
     {}
 
     void setProbePlane(real _posX, real _posY, real _posZ, real _deltaX, real _deltaY, real _deltaZ)
@@ -30,11 +73,15 @@ public:
     }
 
 private:
+    bool isAvailableStatistic(Statistic _variable) override;
+
+    std::vector<PostProcessingVariable> getPostProcessingVariables(Statistic variable) override;
+
     void findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
                     std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,      
                     std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
                     int level) override;
-    void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, int level) override;
+    void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) override;
 
 private:
     real posX, posY, posZ;
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.cu
index 76467d8da942cb189516571db66a473e5c4c32d5..7c0b5947a03330997678b55d7d8063685dca4e1c 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.cu
@@ -1,3 +1,4 @@
+#include "Probe.h"
 #include "PointProbe.h"
 
 #include <cuda.h>
@@ -10,6 +11,64 @@
 #include "DataStructureInitializer/GridProvider.h"
 #include "GPU/CudaMemoryManager.h"
 
+bool PointProbe::isAvailableStatistic(Statistic _variable) // returns true if PointProbe can compute the requested statistic
+{
+    bool isAvailable;
+    switch (_variable)
+    {
+        case Statistic::Instantaneous: // the three statistics PointProbe accepts
+        case Statistic::Means:
+        case Statistic::Variances:
+            isAvailable = true;
+            break;
+        case Statistic::SpatialMeans: // all Spatial* / SpatioTemporal* statistics are rejected
+        case Statistic::SpatioTemporalMeans:
+        case Statistic::SpatialCovariances:
+        case Statistic::SpatioTemporalCovariances:
+        case Statistic::SpatialSkewness:
+        case Statistic::SpatioTemporalSkewness:
+        case Statistic::SpatialFlatness:
+        case Statistic::SpatioTemporalFlatness:
+            isAvailable = false;
+            break;
+        default: // any enumerator not listed above is rejected as well
+            isAvailable = false;
+    }
+    return isAvailable;
+}
+
+std::vector<PostProcessingVariable> PointProbe::getPostProcessingVariables(Statistic statistic)
+{ // Returns the output variables (name + ratio) written for one statistic; an empty vector if the statistic is unsupported.
+    std::vector<PostProcessingVariable> postProcessingVariables;
+    switch (statistic) // Probe::addProbeStruct uses the size of the returned vector to advance the quantity-array offset
+    {
+    case Statistic::Instantaneous:
+        postProcessingVariables.push_back( PostProcessingVariable("vx",  this->velocityRatio) ); // velocityRatio/densityRatio are set in Probe::init from Parameter
+        postProcessingVariables.push_back( PostProcessingVariable("vy",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("vz",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("rho", this->densityRatio ) );
+        break;
+    case Statistic::Means:
+        postProcessingVariables.push_back( PostProcessingVariable("vx_mean",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("vy_mean",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("vz_mean",  this->velocityRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("rho_mean", this->densityRatio ) );
+        break;
+    case Statistic::Variances: // variances are paired with the squared ratio
+        postProcessingVariables.push_back( PostProcessingVariable("vx_var",  pow(this->velocityRatio, 2.0)) );
+        postProcessingVariables.push_back( PostProcessingVariable("vy_var",  pow(this->velocityRatio, 2.0)) );
+        postProcessingVariables.push_back( PostProcessingVariable("vz_var",  pow(this->velocityRatio, 2.0)) );
+        postProcessingVariables.push_back( PostProcessingVariable("rho_var", pow(this->densityRatio,  2.0)) );
+        break;
+
+    default: // statistic not handled by PointProbe
+        printf("Statistic unavailable in PointProbe\n");
+        assert(false); // aborts only when assertions are enabled; otherwise the empty vector is returned
+        break;
+    }
+    return postProcessingVariables;
+}
+
 void PointProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
                        std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,      
                        std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
@@ -42,15 +101,14 @@ void PointProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::ve
     }
 }
 
-void PointProbe::calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, int level)
+void PointProbe::calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) // t is not used in this implementation
 {
     vf::cuda::CudaGrid grid = vf::cuda::CudaGrid(para->getParH(level)->numberofthreads, probeStruct->nPoints);
-
-    interpQuantities<<<grid.grid, grid.threads>>>(  probeStruct->pointIndicesD, probeStruct->nPoints, probeStruct->vals,
-                                                    probeStruct->distXD, probeStruct->distYD, probeStruct->distZD,
-                                                    para->getParD(level)->vx_SP, para->getParD(level)->vy_SP, para->getParD(level)->vz_SP, para->getParD(level)->rho_SP, 
-                                                    para->getParD(level)->neighborX_SP, para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP, 
-                                                    probeStruct->quantitiesD, probeStruct->arrayOffsetsD, probeStruct->quantitiesArrayD, true);
+    interpAndCalcQuantitiesKernel<<<grid.grid, grid.threads>>>(  probeStruct->pointIndicesD, probeStruct->nPoints, probeStruct->vals, // interpolating kernel variant
+                                                probeStruct->distXD, probeStruct->distYD, probeStruct->distZD, // per-point distances consumed by the interpolation weights
+                                                para->getParD(level)->vx_SP, para->getParD(level)->vy_SP, para->getParD(level)->vz_SP, para->getParD(level)->rho_SP, 
+                                                para->getParD(level)->neighborX_SP, para->getParD(level)->neighborY_SP, para->getParD(level)->neighborZ_SP, 
+                                                probeStruct->quantitiesD, probeStruct->arrayOffsetsD, probeStruct->quantitiesArrayD); // statistics are accumulated into quantitiesArrayD on the device
 }
 
 void PointProbe::addProbePointsFromList(std::vector<real>& _pointCoordsX, std::vector<real>& _pointCoordsY, std::vector<real>& _pointCoordsZ)
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h
index c64b2e592bd9b766d0a5bb1553c76d43e433b455..6a6fbe76f089acfafc22672dd3e9d71bd193a3b3 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/PointProbe.h
@@ -1,3 +1,41 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file PointProbe.h
+//! \author Henry Korb, Henrik Asmuth
+//! \date 13/05/2022
+//! \brief Probe computing statistics for a set of points in space
+//!
+//! The set of points can be defined by providing a list or on an x-normal plane (the latter being somewhat redundant with PlaneProbe)
+//! All statistics are temporal.
+//!
+//=======================================================================================
+
 #ifndef PointProbe_H
 #define PointProbe_H
 
@@ -10,25 +48,34 @@ public:
         const std::string _probeName,
         const std::string _outputPath,
         uint _tStartAvg,
+        uint _tAvg,
         uint _tStartOut,
         uint _tOut
     ): Probe(_probeName, 
              _outputPath,
              _tStartAvg, 
+             0,
+             _tAvg,
              _tStartOut, 
-             _tOut)
+             _tOut,
+             true,
+             false)
     {}
 
     void addProbePointsFromList(std::vector<real>& _pointCoordsX, std::vector<real>& _pointCoordsY, std::vector<real>& _pointCoordsZ);
     void addProbePointsFromXNormalPlane(real pos_x, real pos0_y, real pos0_z, real pos1_y, real pos1_z, uint n_y, uint n_z);
     
 private:
+    bool isAvailableStatistic(Statistic _variable) override;
+
+    std::vector<PostProcessingVariable> getPostProcessingVariables(Statistic variable) override;
+
     void findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
                     std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,      
                     std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
                     int level) override;
 
-    void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, int level) override;
+    void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) override;
 
 private:
     std::vector<real> pointCoordsX, pointCoordsY, pointCoordsZ; 
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
index 1875ef83b9bd388f16cf7d63fe4c3af2968a9113..17679f4ff5292f83f8a6758aa55e588db7042472 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.cu
@@ -1,3 +1,35 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Probe.cu
+//! \author Henry Korb, Henrik Asmuth
+//=======================================================================================
+
 #include "Probe.h"
 
 #include <cuda.h>
@@ -13,54 +45,25 @@
 #include "GPU/CudaMemoryManager.h"
 
 
-std::vector<std::string> getPostProcessingVariableNames(PostProcessingVariable variable)
-{
-    std::vector<std::string> varNames;
-    switch (variable)
-    {
-    case PostProcessingVariable::Instantaneous:
-        varNames.push_back("vx");
-        varNames.push_back("vy");
-        varNames.push_back("vz");
-        varNames.push_back("rho");
-        break;
-    case PostProcessingVariable::Means:
-        varNames.push_back("vx_mean");
-        varNames.push_back("vy_mean");
-        varNames.push_back("vz_mean");
-        varNames.push_back("rho_mean");
-        break;
-    case PostProcessingVariable::Variances:
-        varNames.push_back("vx_var");
-        varNames.push_back("vy_var");
-        varNames.push_back("vz_var");
-        varNames.push_back("rho_var");
-        break;
-    default:
-        break;
-    }
-    return varNames;
-}
-
-__device__ void calculateQuantities(uint n, real* quantityArray, bool* quantities, uint* quantityArrayOffsets, uint nPoints, uint node, real vx, real vy, real vz, real rho)
+__device__ void calculatePointwiseQuantities(uint n, real* quantityArray, bool* quantities, uint* quantityArrayOffsets, uint nPoints, uint node, real vx, real vy, real vz, real rho)
 {
     //"https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm"
     // also has extensions for higher order and covariances
     real inv_n = 1/real(n);
 
-    if(quantities[int(PostProcessingVariable::Instantaneous)])
+    if(quantities[int(Statistic::Instantaneous)])
     {
-        uint arrOff = quantityArrayOffsets[int(PostProcessingVariable::Instantaneous)];
+        uint arrOff = quantityArrayOffsets[int(Statistic::Instantaneous)];
         quantityArray[(arrOff+0)*nPoints+node] = vx;
         quantityArray[(arrOff+1)*nPoints+node] = vy;
         quantityArray[(arrOff+2)*nPoints+node] = vz;
         quantityArray[(arrOff+3)*nPoints+node] = rho;
     }
 
-    if(quantities[int(PostProcessingVariable::Means)])
+    if(quantities[int(Statistic::Means)])
     {
         
-        uint arrOff = quantityArrayOffsets[int(PostProcessingVariable::Means)];
+        uint arrOff = quantityArrayOffsets[int(Statistic::Means)];
         real vx_m_old  = quantityArray[(arrOff+0)*nPoints+node];
         real vy_m_old  = quantityArray[(arrOff+1)*nPoints+node];
         real vz_m_old  = quantityArray[(arrOff+2)*nPoints+node];
@@ -76,9 +79,9 @@ __device__ void calculateQuantities(uint n, real* quantityArray, bool* quantitie
         quantityArray[(arrOff+2)*nPoints+node] = vz_m_new;
         quantityArray[(arrOff+3)*nPoints+node] = rho_m_new;
     
-        if(quantities[int(PostProcessingVariable::Variances)])
+        if(quantities[int(Statistic::Variances)])
         {
-            arrOff = quantityArrayOffsets[int(PostProcessingVariable::Variances)];
+            arrOff = quantityArrayOffsets[int(Statistic::Variances)];
 
             real vx_var_old  = quantityArray[(arrOff+0)*nPoints+node];
             real vy_var_old  = quantityArray[(arrOff+1)*nPoints+node];
@@ -98,14 +101,12 @@ __device__ void calculateQuantities(uint n, real* quantityArray, bool* quantitie
     }
 }
 
-__global__ void interpQuantities(   uint* pointIndices,
+__global__ void calcQuantitiesKernel(   uint* pointIndices,
                                     uint nPoints, uint n,
-                                    real* distX, real* distY, real* distZ,
                                     real* vx, real* vy, real* vz, real* rho,            
                                     uint* neighborX, uint* neighborY, uint* neighborZ,
                                     bool* quantities,
-                                    uint* quantityArrayOffsets, real* quantityArray,
-                                    bool interpolate
+                                    uint* quantityArrayOffsets, real* quantityArray
                                 )
 {
     const uint x = threadIdx.x; 
@@ -124,35 +125,65 @@ __global__ void interpQuantities(   uint* pointIndices,
     uint k = pointIndices[node];
     real u_interpX, u_interpY, u_interpZ, rho_interp;
 
-    if(interpolate)
-    {
-        uint ke, kn, kt, kne, kte, ktn, ktne;
-        getNeighborIndicesOfBSW(  k, ke, kn, kt, kne, kte, ktn, ktne, neighborX, neighborY, neighborZ);
+    u_interpX = vx[k];
+    u_interpY = vy[k];
+    u_interpZ = vz[k];
+    rho_interp = rho[k];
 
-        // Trilinear interpolation of macroscopic quantities to probe point
-        real dW, dE, dN, dS, dT, dB;
-        getInterpolationWeights(dW, dE, dN, dS, dT, dB, distX[node], distY[node], distZ[node]);
+    calculatePointwiseQuantities(n, quantityArray, quantities, quantityArrayOffsets, nPoints, node, u_interpX, u_interpY, u_interpZ, rho_interp);
 
+}
 
-        u_interpX  = trilinearInterpolation( dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vx );
-        u_interpY  = trilinearInterpolation( dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vy );
-        u_interpZ  = trilinearInterpolation( dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vz );
-        rho_interp = trilinearInterpolation( dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, rho );
-    }
-    else
-    {
-        u_interpX = vx[k];
-        u_interpY = vy[k];
-        u_interpZ = vz[k];
-        rho_interp = rho[k];
-    }
+__global__ void interpAndCalcQuantitiesKernel(   uint* pointIndices, // per probe point: index of the base node used for interpolation
+                                    uint nPoints, uint n, // n is forwarded to calculatePointwiseQuantities as the sample count
+                                    real* distX, real* distY, real* distZ, // per probe point: inputs to getInterpolationWeights
+                                    real* vx, real* vy, real* vz, real* rho,            
+                                    uint* neighborX, uint* neighborY, uint* neighborZ,
+                                    bool* quantities,
+                                    uint* quantityArrayOffsets, real* quantityArray
+                                )
+{
+    const uint x = threadIdx.x; 
+    const uint y = blockIdx.x;
+    const uint z = blockIdx.y;
+
+    const uint nx = blockDim.x;
+    const uint ny = gridDim.x;
+
+    const uint node = nx*(ny*z + y) + x; // flat thread index; one thread handles one probe point
+
+    if(node>=nPoints) return; // surplus threads of the last block have nothing to do
+
+    // Get indices of neighbor nodes.
+    // node referring to BSW cell as seen from probe point
+    uint k = pointIndices[node];
+    real u_interpX, u_interpY, u_interpZ, rho_interp;
+
+    uint ke, kn, kt, kne, kte, ktn, ktne;
+    getNeighborIndicesOfBSW(  k, ke, kn, kt, kne, kte, ktn, ktne, neighborX, neighborY, neighborZ);
+
+    // Trilinear interpolation of macroscopic quantities to probe point
+    real dW, dE, dN, dS, dT, dB;
+    getInterpolationWeights(dW, dE, dN, dS, dT, dB, distX[node], distY[node], distZ[node]);
 
-    calculateQuantities(n, quantityArray, quantities, quantityArrayOffsets, nPoints, node, u_interpX, u_interpY, u_interpZ, rho_interp);
+    u_interpX  = trilinearInterpolation( dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vx );
+    u_interpY  = trilinearInterpolation( dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vy );
+    u_interpZ  = trilinearInterpolation( dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, vz );
+    rho_interp = trilinearInterpolation( dW, dE, dN, dS, dT, dB, k, ke, kn, kt, kne, kte, ktn, ktne, rho );
+
+    calculatePointwiseQuantities(n, quantityArray, quantities, quantityArrayOffsets, nPoints, node, u_interpX, u_interpY, u_interpZ, rho_interp); // updates the enabled statistics for this point
 
 }
 
+bool Probe::getHasDeviceQuantityArray(){ return this->hasDeviceQuantityArray; } // accessor; the flag gates the host<->device quantity-array copies in this file
+
 void Probe::init(Parameter* para, GridProvider* gridProvider, CudaMemoryManager* cudaManager)
 {
+    this->velocityRatio      = para->getVelocityRatio();
+    this->densityRatio       = para->getDensityRatio();
+    this->forceRatio         = para->getForceRatio();
+    this->stressRatio        = para->getDensityRatio()*pow(para->getVelocityRatio(), 2.0);
+    this->accelerationRatio = para->getVelocityRatio()/para->getTimeRatio();
 
     probeParams.resize(para->getMaxLevel()+1);
 
@@ -184,7 +215,8 @@ void Probe::addProbeStruct(CudaMemoryManager* cudaManager, std::vector<int>& pro
 {
     probeParams[level] = SPtr<ProbeStruct>(new ProbeStruct);
     probeParams[level]->vals = 1;
-    probeParams[level]->nPoints = uint(probeIndices.size());
+    probeParams[level]->nPoints  = uint(pointCoordsX.size()); // Note, need to have both nPoints and nIndices because they differ in PlanarAverage
+    probeParams[level]->nIndices = uint(probeIndices.size());
 
     probeParams[level]->pointCoordsX = (real*)malloc(probeParams[level]->nPoints*sizeof(real));
     probeParams[level]->pointCoordsY = (real*)malloc(probeParams[level]->nPoints*sizeof(real));
@@ -194,30 +226,34 @@ void Probe::addProbeStruct(CudaMemoryManager* cudaManager, std::vector<int>& pro
     std::copy(pointCoordsY.begin(), pointCoordsY.end(), probeParams[level]->pointCoordsY);
     std::copy(pointCoordsZ.begin(), pointCoordsZ.end(), probeParams[level]->pointCoordsZ);
 
-    // Might have to catch nPoints=0 ?!?!
-    cudaManager->cudaAllocProbeDistances(this, level);
+    // Note, dist only needed for kernels that do interpolate
+    if( distX.size()>0 && distY.size()>0 && distZ.size()>0 )
+    {
+        probeParams[level]->hasDistances=true;
+        cudaManager->cudaAllocProbeDistances(this, level);
+        std::copy(distX.begin(), distX.end(), probeParams[level]->distXH);
+        std::copy(distY.begin(), distY.end(), probeParams[level]->distYH);
+        std::copy(distZ.begin(), distZ.end(), probeParams[level]->distZH);
+        cudaManager->cudaCopyProbeDistancesHtoD(this, level);
+    }  
+    
     cudaManager->cudaAllocProbeIndices(this, level);
-
-    std::copy(distX.begin(), distX.end(), probeParams[level]->distXH);
-    std::copy(distY.begin(), distY.end(), probeParams[level]->distYH);
-    std::copy(distZ.begin(), distZ.end(), probeParams[level]->distZH);
     std::copy(probeIndices.begin(), probeIndices.end(), probeParams[level]->pointIndicesH);
-
-    cudaManager->cudaCopyProbeDistancesHtoD(this, level);
     cudaManager->cudaCopyProbeIndicesHtoD(this, level);
 
     uint arrOffset = 0;
 
     cudaManager->cudaAllocProbeQuantitiesAndOffsets(this, level);
 
-    for( int var=0; var<int(PostProcessingVariable::LAST); var++){
-    if(this->quantities[var])
+    for( int var=0; var<int(Statistic::LAST); var++)
     {
-
-        probeParams[level]->quantitiesH[var] = true;
-        probeParams[level]->arrayOffsetsH[var] = arrOffset;
-        arrOffset += uint(getPostProcessingVariableNames(static_cast<PostProcessingVariable>(var)).size());
-    }}
+        if(this->quantities[var])
+        {
+            probeParams[level]->quantitiesH[var] = true;
+            probeParams[level]->arrayOffsetsH[var] = arrOffset;
+            arrOffset += uint( this->getPostProcessingVariables(static_cast<Statistic>(var)).size() ); 
+        }
+    }
     
     cudaManager->cudaCopyProbeQuantitiesAndOffsetsHtoD(this, level);
 
@@ -232,53 +268,67 @@ void Probe::addProbeStruct(CudaMemoryManager* cudaManager, std::vector<int>& pro
             probeParams[level]->quantitiesArrayH[arr*probeParams[level]->nPoints+point] = 0.0f;
         }
     }
-    cudaManager->cudaCopyProbeQuantityArrayHtoD(this, level);
+    if(this->hasDeviceQuantityArray)
+        cudaManager->cudaCopyProbeQuantityArrayHtoD(this, level);
 }
 
 void Probe::interact(Parameter* para, CudaMemoryManager* cudaManager, int level, uint t)
 {
-
-    if(t>this->tStartAvg)
+    if(t>=this->tStartAvg && (t-this->tStartAvg) % this->tAvg==0) // every tAvg steps from tStartAvg on; explicit lower bound because a negative int operand of % with a uint wraps to UINT_MAX
     {
         SPtr<ProbeStruct> probeStruct = this->getProbeStruct(level);
 
-        this->calculateQuantities(probeStruct, para, level);
-        probeStruct->vals++;
+        this->calculateQuantities(probeStruct, para, t, level);
+        if(t>=this->tStartTmpAveraging) probeStruct->vals++; // vals only counts evaluations from tStartTmpAveraging on
+    }
 
-        if(max(int(t) - int(this->tStartOut), -1) % this->tOut == 0)
-        {
+    if(t>=this->tStartOut && (t-this->tStartOut) % this->tOut == 0) // every tOut steps from tStartOut on, never before tStartOut
+    {
+        if(this->hasDeviceQuantityArray) // the device-to-host copy is only done for probes flagged as having a device quantity array
             cudaManager->cudaCopyProbeQuantityArrayDtoH(this, level);
-
-            this->write(para, level, t);
-        }
-
+        this->write(para, level, t);
     }
 }
 
 void Probe::free(Parameter* para, CudaMemoryManager* cudaManager)
 {
     for(int level=0; level<=para->getMaxLevel(); level++)
-    {
-        cudaManager->cudaFreeProbeDistances(this, level);
+    {   
+        if(this->probeParams[level]->hasDistances) // distances are only allocated in addProbeStruct when distance vectors were passed, so only free them in that case
+            cudaManager->cudaFreeProbeDistances(this, level);
         cudaManager->cudaFreeProbeIndices(this, level);
         cudaManager->cudaFreeProbeQuantityArray(this, level);
         cudaManager->cudaFreeProbeQuantitiesAndOffsets(this, level);
     }
 }
 
-void Probe::addPostProcessingVariable(PostProcessingVariable variable)
+void Probe::addStatistic(Statistic variable) // flags the statistic in this->quantities
 {
+    assert(this->isAvailableStatistic(variable)); // availability is decided by the derived probe; only checked when assertions are enabled
+
     this->quantities[int(variable)] = true;
     switch(variable)
     {
-        case PostProcessingVariable::Variances: 
-            this->addPostProcessingVariable(PostProcessingVariable::Means); break;
+        case Statistic::Variances: 
+            this->addStatistic(Statistic::Means); break; // requesting Variances also enables Means
+
         default: break;
     }
 }
 
+void Probe::addAllAvailableStatistics()
+{
+    for( int index=0; index < int(Statistic::LAST); index++) // visit every enum value below LAST
+    {
+        const Statistic candidate = static_cast<Statistic>(index);
+        if(this->isAvailableStatistic(candidate)) this->addStatistic(candidate); // enable only what the derived probe offers
+    }
+}
+
 void Probe::write(Parameter* para, int level, int t)
 {
+    int t_write = this->fileNameLU ? t: t/this->tOut; 
+
     const uint numberOfParts = this->getProbeStruct(level)->nPoints / para->getlimitOfNodesForVTK() + 1;
 
     std::vector<std::string> fnames;
@@ -286,21 +336,22 @@ void Probe::write(Parameter* para, int level, int t)
 	{
         std::string fname = this->probeName + "_bin_lev_" + StringUtil::toString<int>(level)
                                          + "_ID_" + StringUtil::toString<int>(para->getMyID())
-                                         + "_Part_" + StringUtil::toString<int>(i) 
-                                         + "_t_" + StringUtil::toString<int>(t) 
-                                         + ".vtk";
+                                         + "_Part_" + StringUtil::toString<int>(i);
+        if(!this->outputTimeSeries) fname += "_t_" + StringUtil::toString<int>(t_write);
+        fname += ".vtk";
 		fnames.push_back(fname);
         this->fileNamesForCollectionFile.push_back(fname);
     }
     this->writeGridFiles(para, level, fnames, t);
 
-    if(level == 0) this->writeCollectionFile(para, t);
+    if(level == 0 && !this->outputTimeSeries) this->writeCollectionFile(para, t);
 }
 
 void Probe::writeCollectionFile(Parameter* para, int t)
 {
+    int t_write = this->fileNameLU ? t: t/this->tOut; 
     std::string filename = this->probeName + "_bin_ID_" + StringUtil::toString<int>(para->getMyID()) 
-                                           + "_t_" + StringUtil::toString<int>(t) 
+                                           + "_t_" + StringUtil::toString<int>(t_write) 
                                            + ".vtk";
 
     std::ofstream file;
@@ -314,7 +365,7 @@ void Probe::writeCollectionFile(Parameter* para, int t)
 
     file << "    <PPointData>" << std::endl;
 
-    for(std::string varName: this->getVarNames())
+    for(std::string varName: this->getVarNames()) //TODO
     {
         file << "       <DataArray type=\"Float64\" Name=\""<< varName << "\" /> " << std::endl;
     }
@@ -355,7 +406,8 @@ void Probe::writeGridFiles(Parameter* para, int level, std::vector<std::string>&
     for (uint part = 0; part < fnames.size(); part++)
     {        
         startpos = part * para->getlimitOfNodesForVTK();
-        sizeOfNodes = min(para->getlimitOfNodesForVTK(), probeStruct->nPoints - startpos);
+        uint nDataPoints = this->outputTimeSeries? this->tProbe: probeStruct->nPoints;
+        sizeOfNodes = min(para->getlimitOfNodesForVTK(), nDataPoints - startpos);
         endpos = startpos + sizeOfNodes;
 
         //////////////////////////////////////////////////////////////////////////
@@ -370,38 +422,29 @@ void Probe::writeGridFiles(Parameter* para, int level, std::vector<std::string>&
 
         for( auto it=nodedata.begin(); it!=nodedata.end(); it++) it->resize(sizeOfNodes);
 
-        for( int var=0; var < int(PostProcessingVariable::LAST); var++){
-        if(this->quantities[var])
-        {
-            PostProcessingVariable quantity = static_cast<PostProcessingVariable>(var);
-            real coeff;
-            uint n_arrs = uint(getPostProcessingVariableNames(quantity).size());
-
-            switch(quantity)
+        for( int var=0; var < int(Statistic::LAST); var++){           
+            if(this->quantities[var])
             {
-            case PostProcessingVariable::Instantaneous:
-                coeff = para->getVelocityRatio();
-            break;
-            case PostProcessingVariable::Means:
-                coeff = para->getVelocityRatio();
-            break;
-            case PostProcessingVariable::Variances:
-                coeff = pow(para->getVelocityRatio(),2);
-            break;
-            default: break;
-            }
+                Statistic statistic = static_cast<Statistic>(var);
+                real coeff;
 
-            uint arrOff = probeStruct->arrayOffsetsH[var];
-            uint arrLen = probeStruct->nPoints;
+                std::vector<PostProcessingVariable> postProcessingVariables = this->getPostProcessingVariables(statistic);
+                uint n_arrs = uint(postProcessingVariables.size());
 
-            for(uint arr=0; arr<n_arrs; arr++)
-            {
-                for (uint pos = startpos; pos < endpos; pos++)
+                uint arrOff = probeStruct->arrayOffsetsH[var];
+                uint arrLen = probeStruct->nPoints;
+
+                for(uint arr=0; arr<n_arrs; arr++)
                 {
-                    nodedata[arrOff+arr][pos-startpos] = double(probeStruct->quantitiesArrayH[(arrOff+arr)*arrLen+pos]*coeff);
+                    coeff = postProcessingVariables[arr].conversionFactor;
+                    
+                    for (uint pos = startpos; pos < endpos; pos++)
+                    {
+                        nodedata[arrOff+arr][pos-startpos] = double(probeStruct->quantitiesArrayH[(arrOff+arr)*arrLen+pos]*coeff);
+                    }
                 }
             }
-        }}
+        }
         WbWriterVtkXmlBinary::getInstance()->writeNodesWithNodeData(this->outputPath + "/" + fnames[part], nodes, nodedatanames, nodedata);
     }
 }
@@ -409,11 +452,14 @@ void Probe::writeGridFiles(Parameter* para, int level, std::vector<std::string>&
 std::vector<std::string> Probe::getVarNames()
 {
     std::vector<std::string> varNames;
-    for( int var=0; var < int(PostProcessingVariable::LAST); var++){
-    if(this->quantities[var])
+    for( int statistic=0; statistic < int(Statistic::LAST); statistic++) // enum order, the same order in which addProbeStruct assigns the array offsets
     {
-        std::vector<std::string> names = getPostProcessingVariableNames(static_cast<PostProcessingVariable>(var));
-        varNames.insert(varNames.end(), names.begin(), names.end());
-    }}
+        if(this->quantities[statistic])
+        {
+            const std::vector<PostProcessingVariable> postProcessingVariables = this->getPostProcessingVariables(static_cast<Statistic>(statistic));
+            for(const PostProcessingVariable& postProcessingVariable : postProcessingVariables) // range-for avoids comparing a signed index with size()
+                varNames.push_back(postProcessingVariable.name);
+        }
+    }
     return varNames;
-}
+}
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
index 988d1817e3b19bbfa5c25ed8d271a105ff433de9..d030d0c7a7344a44933b8114b7cd39c7ade3bf30 100644
--- a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/Probe.h
@@ -1,3 +1,45 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file Probe.h
+//! \author Henry Korb, Henrik Asmuth
+//! \date 13/05/2022
+//! \brief Base class for probes called in UpdateGrid27
+//!
+//! Any probe should be initiated in the app and added via para->addProbe( someProbe )
+//! Note, that all probes generally require that macroscopic variables have been updated in the 
+//! time step they are called in. Most collision kernels (atm, all except TurbulentViscosityCumulantK17CompChim )
+//! don't do this and would require an explicit call of calcMacroscopicQuantities. It does seem quite 
+//! inexpensive though to simply save vx, vy, etc., directly in the collider.
+//!
+//! \todo might have to adapt conversionFactors when using grid refinement
+//=======================================================================================
+
 #ifndef Probe_H
 #define Probe_H
 
@@ -6,36 +48,80 @@
 #include "PreCollisionInteractor/PreCollisionInteractor.h"
 #include "PointerDefinitions.h"
 
-enum class PostProcessingVariable{ 
-    // HowTo add new PostProcessingVariable: Add enum here, LAST has to stay last
-    // In interpQuantities add computation of quantity in switch statement
-    // In writeGridFiles add lb->rw conversion factor
-    // In getPostProcessingVariableNames add names
-    // If new quantity depends on other quantities i.e. mean, catch in addPostProcessingVariable
+//=======================================================================================
+//! \note How to add new Statistics 
+//! Generally, the Statistic enum refers to the type of statistic to be calculated. 
+//! It then depends on the derived probe class, which of these statistics are available. 
+//! Some types of statistics are only suitable for certain probe classes, others might 
+//! simply not have been implemented, yet.
+//! For the same reasons it is also probe-specific, for which quantities (e.g. velocities, rho, etc.) these statistics are computed. 
+//! The specific quantity (e.g., mean of vx, or variance of rho) is defined as PostProcessingVariable in getPostProcessingVariables of each respective probe.
+//! PostProcessingVariable also holds the name and conversionFactor of the quantity that is required when writing the data to file
+//! 
+//! To add new Statistics:
+//!     1. Add enum here, LAST has to stay last
+//!     2. For PointProbe and PlaneProbe: add the computation of the statistic in switch statement in calculatePointwiseQuantities. 
+//!     3. For PlanarAverageProbe and WallModelProbe: add the computation directly in calculateQuantities.
+//!     4. In getPostProcessingVariables add the statistic in the switch statement and add the corresponding PostProcessingVariables
+//!     5. Add Statistic to isAvailableStatistic of the respective probe
+//!
+//!  When adding new quantities to existing statistics (e.g., add rho to PlanarAverageProbe which currently only computes stats of velocity) only do steps 2 to 4
+//!
+
+enum class Statistic{ 
+    // Statistics currently available in Point and Plane probe (all temporal pointwise statistics)
     Instantaneous,
     Means,
     Variances,
+
+    // Statistics available in PlanarAverage probe and (partially) in WallModelProbe
+    // Spatial statistics are typically computed across fixed spatial subdomains, e.g. a plane of constant height
+    // Spatio-temporal statistics additionally average the spatial statistics in time
     SpatialMeans,
     SpatioTemporalMeans,
     SpatialCovariances,
     SpatioTemporalCovariances,
     SpatialSkewness,
     SpatioTemporalSkewness,
     SpatialFlatness,
     SpatioTemporalFlatness,
     LAST,
 };
 
+//! Name of an output quantity together with the factor applied to its values when writing to file.
+struct PostProcessingVariable{
+    std::string name;       //!< name collected by Probe::getVarNames
+    real conversionFactor;  //!< multiplied onto the stored values in Probe::writeGridFiles
+    PostProcessingVariable( std::string _name, real _conversionFactor)
+    : name(_name), conversionFactor(_conversionFactor) {}
+};
+
 struct ProbeStruct{
-    uint nPoints, nArrays, vals;
+    uint nPoints, nIndices, nArrays, vals; // nIndices: number of entries of pointIndicesD used by WallModelProbe; vals: evaluation counter incremented in Probe::interact
     uint *pointIndicesH, *pointIndicesD;
     real *pointCoordsX, *pointCoordsY, *pointCoordsZ;
+    bool hasDistances=false; // set in Probe::addProbeStruct once the distance arrays are allocated; checked in Probe::free before freeing them
     real *distXH, *distYH, *distZH, *distXD, *distYD, *distZD;
     real *quantitiesArrayH, *quantitiesArrayD;
     bool *quantitiesH, *quantitiesD;
     uint *arrayOffsetsH, *arrayOffsetsD;
 };
 
-__global__ void interpQuantities(   uint* pointIndices,
+__global__ void calcQuantitiesKernel(   uint* pointIndices,
+                                    uint nPoints, uint n,
+                                    real* vx, real* vy, real* vz, real* rho,            
+                                    uint* neighborX, uint* neighborY, uint* neighborZ,
+                                    bool* quantities,
+                                    uint* quantityArrayOffsets, real* quantityArray
+                                );
+
+__global__ void interpAndCalcQuantitiesKernel(   uint* pointIndices,
                                     uint nPoints, uint n,
                                     real* distX, real* distY, real* distZ,
                                     real* vx, real* vy, real* vz, real* rho,            
                                     uint* neighborX, uint* neighborY, uint* neighborZ,
                                     bool* quantities,
-                                    uint* quantityArrayOffsets, real* quantityArray,
-                                    bool interpolate
+                                    uint* quantityArrayOffsets, real* quantityArray
                                 );
 
 
@@ -46,13 +132,21 @@ public:
         const std::string _probeName,
         const std::string _outputPath,
         uint _tStartAvg,
+        uint _tStartTmpAvg,
+        uint _tAvg,
         uint _tStartOut,
-        uint _tOut
+        uint _tOut,
+        bool _hasDeviceQuantityArray,
+        bool _outputTimeSeries
     ):  probeName(_probeName),
         outputPath(_outputPath),
         tStartAvg(_tStartAvg),
+        tStartTmpAveraging(_tStartTmpAvg),
+        tAvg(_tAvg),
         tStartOut(_tStartOut),
         tOut(_tOut),
+        hasDeviceQuantityArray(_hasDeviceQuantityArray),
+        outputTimeSeries(_outputTimeSeries),        
         PreCollisionInteractor()
     {
         assert("Output starts before averaging!" && tStartOut>=tStartAvg);
@@ -64,9 +158,20 @@ public:
 
     SPtr<ProbeStruct> getProbeStruct(int level){ return this->probeParams[level]; }
 
-    void addPostProcessingVariable(PostProcessingVariable _variable);
+    void addStatistic(Statistic _variable);
+    void addAllAvailableStatistics();
+    
+    bool getHasDeviceQuantityArray();
+    uint getTStartTmpAveraging(){return this->tStartTmpAveraging;}
+
+    void setFileNameToNOut(){this->fileNameLU = false;}
+    void setTStartTmpAveraging(uint _tStartTmpAveraging){this->tStartTmpAveraging = _tStartTmpAveraging;}
 
 private:
+    virtual bool isAvailableStatistic(Statistic _variable) = 0;
+
+    virtual std::vector<PostProcessingVariable> getPostProcessingVariables(Statistic variable) = 0;
+
     virtual void findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
                        std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,      
                        std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
@@ -75,7 +180,7 @@ private:
                         std::vector<real>& distX, std::vector<real>& distY, std::vector<real>& distZ,   
                         std::vector<real>& pointCoordsX, std::vector<real>& pointCoordsY, std::vector<real>& pointCoordsZ,
                         int level);
-    virtual void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, int level) = 0;
+    virtual void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) = 0;
 
     void write(Parameter* para, int level, int t);
     void writeCollectionFile(Parameter* para, int t);
@@ -87,13 +192,28 @@ private:
     const std::string outputPath;
 
     std::vector<SPtr<ProbeStruct>> probeParams;
-    bool quantities[int(PostProcessingVariable::LAST)] = {};
+    bool quantities[int(Statistic::LAST)] = {};
+    bool hasDeviceQuantityArray;    //!< flag initiating memCopy in Point and PlaneProbe. Other probes are only based on thrust reduce functions and therefore don't need an explicit memCopy in interact()
+    bool outputTimeSeries;          //!< flag initiating overwrite of output vtk files, skipping collection files and limiting the length of the written data to the current time step (currently only used for WallModelProbe)
     std::vector<std::string> fileNamesForCollectionFile;
     std::vector<std::string> varNames;
 
+    bool fileNameLU = true; //!< if true, the written file name contains the time step in LU, else the number of the written probe file
+
+protected:
     uint tStartAvg;
+    uint tStartTmpAveraging; //!< only non-zero in PlanarAverageProbe and WallModelProbe to switch on spatio-temporal averaging (while only doing spatial averaging for t<tStartTmpAveraging)
+    uint tAvg;
     uint tStartOut;
     uint tOut;
+
+    uint tProbe = 0; //!< counter for the number of probe evaluations. Only used when outputting time series
+
+    real velocityRatio;
+    real densityRatio;
+    real forceRatio;
+    real stressRatio;
+    real accelerationRatio;
 };
 
 #endif
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.cu b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.cu
new file mode 100644
index 0000000000000000000000000000000000000000..15327beef059f298ec7dacc663f4f986fb577c5a
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.cu
@@ -0,0 +1,300 @@
+#include "Probe.h"
+#include "WallModelProbe.h"
+
+#include <cuda/CudaGrid.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/device_ptr.h>
+#include <thrust/inner_product.h>
+
+#include "Parameter/Parameter.h"
+#include "DataStructureInitializer/GridProvider.h"
+#include "GPU/CudaMemoryManager.h"
+
+
+///////////////////////////////////////////////////////////////////////////////////
+/// Functors for thrust reductions
+///////////////////////////////////////////////////////////////////////////////////
+
+//! Unary functor returning the square of its argument.
+//! Meant to be passed to thrust algorithms;
+//! callable from both host and device code.
+template<typename T>
+struct pow2 : public thrust::unary_function<T,T>
+{
+  __host__ __device__ T operator()(const T &value) const { return value * value; }
+};
+
+//! Unary functor returning the cube of its argument.
+//! Meant to be passed to thrust algorithms;
+//! callable from both host and device code.
+template<typename T>
+struct pow3 : public thrust::unary_function<T,T>
+{
+  __host__ __device__ T operator()(const T &value) const { return value * value * value; }
+};
+
+//! Unary functor returning the fourth power of its argument.
+//! Meant to be passed to thrust algorithms;
+//! callable from both host and device code.
+template<typename T>
+struct pow4 : public thrust::unary_function<T,T>
+{
+  __host__ __device__ T operator()(const T &value) const { return value * value * value * value; }
+};
+
+//! Unary functor returning (x - mean)^n, i.e. the n-th power of a sample's deviation from a fixed mean.
+//! All arithmetic is done in real, so no precision is lost through float when real is double.
+struct nth_moment
+{
+    const real mean; //!< value subtracted from every sample
+    const int n;     //!< exponent; n <= 0 yields 1
+
+    nth_moment(real _mean, int _n) : mean(_mean), n(_n) {}
+
+    __host__ __device__ real operator()(const real& x) const
+    {
+        const real fluctuation = x-mean;
+        real moment = real(1.0); // start from 1 so that the zeroth moment is handled as well
+        for(int i = 0; i<n; i++) moment *= fluctuation;
+        return moment;
+    }
+};
+
+
+///////////////////////////////////////////////////////////////////////////////////
+//! \brief Tells whether a statistic can be requested from the WallModelProbe.
+//!
+//! Available:
+//!   - Statistic::SpatialMeans
+//!   - Statistic::SpatioTemporalMeans
+//!
+//! Not available:
+//!   - Statistic::Instantaneous, Statistic::Means, Statistic::Variances
+//!   - Statistic::SpatialCovariances, Statistic::SpatioTemporalCovariances
+//!   - Statistic::SpatialSkewness,    Statistic::SpatioTemporalSkewness
+//!   - Statistic::SpatialFlatness,    Statistic::SpatioTemporalFlatness
+//!   - any other value (e.g. Statistic::LAST)
+//!
+//! \param _variable statistic to check
+//! \return true if the statistic is available, false otherwise
+bool WallModelProbe::isAvailableStatistic(Statistic _variable)
+{
+    switch (_variable)
+    {
+        case Statistic::SpatialMeans:
+        case Statistic::SpatioTemporalMeans:
+            return true;
+
+        default: // every statistic listed as not available above
+            break;
+    }
+    return false;
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+//! \brief Returns the output variables (name and conversion factor) belonging to a statistic.
+//!
+//! SpatialMeans and SpatioTemporalMeans contain the same quantities in the same order and differ
+//! only in the suffix of the names, so the list is assembled once with the matching suffix.
+//! The order of the entries has to match the array order used in calculateQuantities.
+//!
+//! \param statistic statistic whose variables are requested
+//! \return list of variables; empty for unavailable statistics (which also trigger an assert)
+std::vector<PostProcessingVariable> WallModelProbe::getPostProcessingVariables(Statistic statistic)
+{
+    std::vector<PostProcessingVariable> postProcessingVariables;
+    std::string suffix;
+    switch (statistic)
+    {
+    case Statistic::SpatialMeans:
+        suffix = "_spatMean";
+        break;
+    case Statistic::SpatioTemporalMeans:
+        suffix = "_spatTmpMean";
+        break;
+    default:
+        printf("Statistic unavailable in WallModelProbe\n");
+        assert(false);
+        return postProcessingVariables;
+    }
+
+    // the wall force entries use the stress or the force conversion factor, depending on outputStress
+    const real wallForceFactor = this->outputStress? this->stressRatio: this->forceRatio;
+
+    postProcessingVariables.push_back( PostProcessingVariable("vx_el"  + suffix, this->velocityRatio) );
+    postProcessingVariables.push_back( PostProcessingVariable("vy_el"  + suffix, this->velocityRatio) );
+    postProcessingVariables.push_back( PostProcessingVariable("vz_el"  + suffix, this->velocityRatio) );
+    postProcessingVariables.push_back( PostProcessingVariable("vx1"    + suffix, this->velocityRatio) );
+    postProcessingVariables.push_back( PostProcessingVariable("vy1"    + suffix, this->velocityRatio) );
+    postProcessingVariables.push_back( PostProcessingVariable("vz1"    + suffix, this->velocityRatio) );
+    postProcessingVariables.push_back( PostProcessingVariable("u_star" + suffix, this->velocityRatio) );
+    postProcessingVariables.push_back( PostProcessingVariable("Fx"     + suffix, wallForceFactor) );
+    postProcessingVariables.push_back( PostProcessingVariable("Fy"     + suffix, wallForceFactor) );
+    postProcessingVariables.push_back( PostProcessingVariable("Fz"     + suffix, wallForceFactor) );
+
+    if(this->evaluatePressureGradient)
+    {
+        postProcessingVariables.push_back( PostProcessingVariable("dpdx" + suffix, this->accelerationRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("dpdy" + suffix, this->accelerationRatio) );
+        postProcessingVariables.push_back( PostProcessingVariable("dpdz" + suffix, this->accelerationRatio) );
+    }
+
+    return postProcessingVariables;
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+//! Creates one "point" per probe evaluation, whose x coordinate holds the time of that evaluation,
+//! and, if the pressure gradient is evaluated, collects the indices of all fluid nodes of the level.
+//! gridProvider and the dist*_level vectors are not used by this probe.
+void WallModelProbe::findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
+                            std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,      
+                            std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
+                            int level)
+{
+    assert( para->getParD(level)->kStressQ > 0 && para->gethasWallModelMonitor() );
+
+    real dt = para->getTimeRatio();
+    uint nt = uint((para->getTEnd()-this->tStartAvg)/this->tAvg);
+
+    for(uint t=0; t<nt; t++)
+    {
+        pointCoordsX_level.push_back(dt*(t*this->tAvg+this->tStartAvg)); // x coord will serve as time in this probe; convert the whole step count (start offset included) with dt
+        pointCoordsY_level.push_back(0);
+        pointCoordsZ_level.push_back(0);
+    }
+
+    if(this->evaluatePressureGradient)
+    {
+        assert(para->getIsBodyForce());
+        // Find all fluid nodes
+        for(uint j=1; j<para->getParH(level)->size_Mat_SP; j++ )
+        {
+            if( para->getParH(level)->geoSP[j] == GEO_FLUID) probeIndices_level.push_back(j);
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////
+
+void WallModelProbe::calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level)
+{   
+    bool doTmpAveraging = (t>this->getTStartTmpAveraging());
+
+    // Pointer casts to use device arrays in thrust reductions
+    thrust::device_ptr<real> u_el_thrust    = thrust::device_pointer_cast(para->getParD(level)->QStress.Vx);
+    thrust::device_ptr<real> v_el_thrust    = thrust::device_pointer_cast(para->getParD(level)->QStress.Vy);
+    thrust::device_ptr<real> w_el_thrust    = thrust::device_pointer_cast(para->getParD(level)->QStress.Vz);
+    thrust::device_ptr<real> u1_thrust      = thrust::device_pointer_cast(para->getParD(level)->QStress.Vx1);
+    thrust::device_ptr<real> v1_thrust      = thrust::device_pointer_cast(para->getParD(level)->QStress.Vy1);
+    thrust::device_ptr<real> w1_thrust      = thrust::device_pointer_cast(para->getParD(level)->QStress.Vz1);
+    thrust::device_ptr<real> u_star_thrust  = thrust::device_pointer_cast(para->getParD(level)->wallModel.u_star);
+    thrust::device_ptr<real> Fx_thrust      = thrust::device_pointer_cast(para->getParD(level)->wallModel.Fx);
+    thrust::device_ptr<real> Fy_thrust      = thrust::device_pointer_cast(para->getParD(level)->wallModel.Fy);
+    thrust::device_ptr<real> Fz_thrust      = thrust::device_pointer_cast(para->getParD(level)->wallModel.Fz);
+    thrust::device_ptr<real> dpdx_thrust    = thrust::device_pointer_cast(para->getParD(level)->forceX_SP);
+    thrust::device_ptr<real> dpdy_thrust    = thrust::device_pointer_cast(para->getParD(level)->forceY_SP);
+    thrust::device_ptr<real> dpdz_thrust    = thrust::device_pointer_cast(para->getParD(level)->forceZ_SP);
+
+    thrust::device_ptr<uint> indices_thrust = thrust::device_pointer_cast(probeStruct->pointIndicesD);
+    typedef thrust::device_vector<real>::iterator valIterator;
+    typedef thrust::device_vector<uint>::iterator indIterator;
+    thrust::permutation_iterator<valIterator, indIterator> dpdx_iter_begin(dpdx_thrust, indices_thrust);
+    thrust::permutation_iterator<valIterator, indIterator> dpdx_iter_end  (dpdx_thrust, indices_thrust+probeStruct->nIndices);
+    thrust::permutation_iterator<valIterator, indIterator> dpdy_iter_begin(dpdy_thrust, indices_thrust);
+    thrust::permutation_iterator<valIterator, indIterator> dpdy_iter_end  (dpdy_thrust, indices_thrust+probeStruct->nIndices);
+    thrust::permutation_iterator<valIterator, indIterator> dpdz_iter_begin(dpdz_thrust, indices_thrust);
+    thrust::permutation_iterator<valIterator, indIterator> dpdz_iter_end  (dpdz_thrust, indices_thrust+probeStruct->nIndices);
+
+    real N = para->getParD(level)->kStressQ;
+    real n = (real)probeStruct->vals;
+    int nPoints = probeStruct->nPoints;
+
+    if(probeStruct->quantitiesH[int(Statistic::SpatialMeans)])
+    {
+        // Compute the instantaneous spatial means of the velocity moments 
+        real spatMean_u_el      = thrust::reduce(u_el_thrust, u_el_thrust+N)/N;
+        real spatMean_v_el      = thrust::reduce(v_el_thrust, v_el_thrust+N)/N;
+        real spatMean_w_el      = thrust::reduce(w_el_thrust, w_el_thrust+N)/N;
+        real spatMean_u1        = thrust::reduce(u1_thrust, u1_thrust+N)/N;
+        real spatMean_v1        = thrust::reduce(v1_thrust, v1_thrust+N)/N;
+        real spatMean_w1        = thrust::reduce(w1_thrust, w1_thrust+N)/N;
+        real spatMean_u_star    = thrust::reduce(u_star_thrust, u_star_thrust+N)/N;
+        real spatMean_Fx        = thrust::reduce(Fx_thrust, Fx_thrust+N)/N;
+        real spatMean_Fy        = thrust::reduce(Fy_thrust, Fy_thrust+N)/N;
+        real spatMean_Fz        = thrust::reduce(Fz_thrust, Fz_thrust+N)/N;
+
+        uint arrOff = probeStruct->arrayOffsetsH[int(Statistic::SpatialMeans)];
+        probeStruct->quantitiesArrayH[(arrOff+0)*nPoints+tProbe] = spatMean_u_el;
+        probeStruct->quantitiesArrayH[(arrOff+1)*nPoints+tProbe] = spatMean_v_el;
+        probeStruct->quantitiesArrayH[(arrOff+2)*nPoints+tProbe] = spatMean_w_el;
+        probeStruct->quantitiesArrayH[(arrOff+3)*nPoints+tProbe] = spatMean_u1;
+        probeStruct->quantitiesArrayH[(arrOff+4)*nPoints+tProbe] = spatMean_v1;
+        probeStruct->quantitiesArrayH[(arrOff+5)*nPoints+tProbe] = spatMean_w1;
+        probeStruct->quantitiesArrayH[(arrOff+6)*nPoints+tProbe] = spatMean_u_star;
+        probeStruct->quantitiesArrayH[(arrOff+7)*nPoints+tProbe] = spatMean_Fx;
+        probeStruct->quantitiesArrayH[(arrOff+8)*nPoints+tProbe] = spatMean_Fy;
+        probeStruct->quantitiesArrayH[(arrOff+9)*nPoints+tProbe] = spatMean_Fz;
+
+        real spatMean_dpdx;
+        real spatMean_dpdy;
+        real spatMean_dpdz;
+        if(this->evaluatePressureGradient)
+        {
+            real N_fluid = (real)probeStruct->nIndices;
+            spatMean_dpdx      = thrust::reduce(dpdx_iter_begin, dpdx_iter_end)/N_fluid;
+            spatMean_dpdy      = thrust::reduce(dpdy_iter_begin, dpdy_iter_end)/N_fluid;
+            spatMean_dpdz      = thrust::reduce(dpdz_iter_begin, dpdz_iter_end)/N_fluid;
+            probeStruct->quantitiesArrayH[(arrOff+10)*nPoints+tProbe] = spatMean_dpdx;
+            probeStruct->quantitiesArrayH[(arrOff+11)*nPoints+tProbe] = spatMean_dpdy;
+            probeStruct->quantitiesArrayH[(arrOff+12)*nPoints+tProbe] = spatMean_dpdz;
+        }
+
+        if(probeStruct->quantitiesH[int(Statistic::SpatioTemporalMeans)] && doTmpAveraging)
+        {
+            uint arrOff = probeStruct->arrayOffsetsH[int(Statistic::SpatioTemporalMeans)];
+            real spatMean_u_el_old      = probeStruct->quantitiesArrayH[(arrOff+0)*nPoints+tProbe-1];
+            real spatMean_v_el_old      = probeStruct->quantitiesArrayH[(arrOff+1)*nPoints+tProbe-1];
+            real spatMean_w_el_old      = probeStruct->quantitiesArrayH[(arrOff+2)*nPoints+tProbe-1];
+            real spatMean_u1_old        = probeStruct->quantitiesArrayH[(arrOff+3)*nPoints+tProbe-1];
+            real spatMean_v1_old        = probeStruct->quantitiesArrayH[(arrOff+4)*nPoints+tProbe-1];
+            real spatMean_w1_old        = probeStruct->quantitiesArrayH[(arrOff+5)*nPoints+tProbe-1];
+            real spatMean_u_star_old    = probeStruct->quantitiesArrayH[(arrOff+6)*nPoints+tProbe-1];
+            real spatMean_Fx_old        = probeStruct->quantitiesArrayH[(arrOff+7)*nPoints+tProbe-1];
+            real spatMean_Fy_old        = probeStruct->quantitiesArrayH[(arrOff+8)*nPoints+tProbe-1];
+            real spatMean_Fz_old        = probeStruct->quantitiesArrayH[(arrOff+9)*nPoints+tProbe-1];
+
+            probeStruct->quantitiesArrayH[(arrOff+0)*nPoints+tProbe] = spatMean_u_el_old + (spatMean_u_el-spatMean_u_el_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+1)*nPoints+tProbe] = spatMean_v_el_old + (spatMean_v_el-spatMean_v_el_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+2)*nPoints+tProbe] = spatMean_w_el_old + (spatMean_w_el-spatMean_w_el_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+3)*nPoints+tProbe] = spatMean_u1_old + (spatMean_u1-spatMean_u1_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+4)*nPoints+tProbe] = spatMean_v1_old + (spatMean_v1-spatMean_v1_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+5)*nPoints+tProbe] = spatMean_w1_old + (spatMean_w1-spatMean_w1_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+6)*nPoints+tProbe] = spatMean_u_star_old +(spatMean_u_star-spatMean_u_star_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+7)*nPoints+tProbe] = spatMean_Fx_old + (spatMean_Fx-spatMean_Fx_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+8)*nPoints+tProbe] = spatMean_Fy_old + (spatMean_Fy-spatMean_Fy_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+9)*nPoints+tProbe] = spatMean_Fz_old + (spatMean_Fz-spatMean_Fz_old)/n;
+
+            if(this->evaluatePressureGradient)
+            {
+            real spatMean_dpdx_old     = probeStruct->quantitiesArrayH[(arrOff+10)*nPoints+tProbe-1];
+            real spatMean_dpdy_old     = probeStruct->quantitiesArrayH[(arrOff+11)*nPoints+tProbe-1];
+            real spatMean_dpdz_old     = probeStruct->quantitiesArrayH[(arrOff+12)*nPoints+tProbe-1];
+            probeStruct->quantitiesArrayH[(arrOff+10)*nPoints+tProbe] = spatMean_dpdx_old + (spatMean_dpdx-spatMean_dpdx_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+11)*nPoints+tProbe] = spatMean_dpdy_old + (spatMean_dpdy-spatMean_dpdy_old)/n;
+            probeStruct->quantitiesArrayH[(arrOff+12)*nPoints+tProbe] = spatMean_dpdz_old + (spatMean_dpdz-spatMean_dpdz_old)/n;
+            }
+        }    
+    }
+
+    this->tProbe += 1;
+    getLastCudaError("WallModelProbe::calculateQuantities execution failed");
+}
+
diff --git a/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6464c5ca2aa60310cc6bb7ca0a210bc12e755ff
--- /dev/null
+++ b/src/gpu/VirtualFluids_GPU/PreCollisionInteractor/Probes/WallModelProbe.h
@@ -0,0 +1,87 @@
+//=======================================================================================
+// ____          ____    __    ______     __________   __      __       __        __         
+// \    \       |    |  |  |  |   _   \  |___    ___| |  |    |  |     /  \      |  |        
+//  \    \      |    |  |  |  |  |_)   |     |  |     |  |    |  |    /    \     |  |        
+//   \    \     |    |  |  |  |   _   /      |  |     |  |    |  |   /  /\  \    |  |        
+//    \    \    |    |  |  |  |  | \  \      |  |     |   \__/   |  /  ____  \   |  |____    
+//     \    \   |    |  |__|  |__|  \__\     |__|      \________/  /__/    \__\  |_______|   
+//      \    \  |    |   ________________________________________________________________    
+//       \    \ |    |  |  ______________________________________________________________|   
+//        \    \|    |  |  |         __          __     __     __     ______      _______    
+//         \         |  |  |_____   |  |        |  |   |  |   |  |   |   _  \    /  _____)   
+//          \        |  |   _____|  |  |        |  |   |  |   |  |   |  | \  \   \_______    
+//           \       |  |  |        |  |_____   |   \_/   |   |  |   |  |_/  /    _____  |
+//            \ _____|  |__|        |________|   \_______/    |__|   |______/    (_______/   
+//
+//  This file is part of VirtualFluids. VirtualFluids is free software: you can 
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of 
+//  the License, or (at your option) any later version.
+//  
+//  VirtualFluids is distributed in the hope that it will be useful, but WITHOUT 
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License 
+//  for more details.
+//  
+//  You should have received a copy of the GNU General Public License along
+//  with VirtualFluids (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file WallModelProbe.h
+//! \author Henrik Asmuth
+//! \date 13/05/2022
+//! \brief Probe computing statistics of all relevant wall model quantities used in the StressBC kernels
+//!
+//! Computes spatial statistics for all grid points of the StressBC.
+//! The spatial statistics can additionally be averaged in time.
+//!
+//=======================================================================================
+
+#ifndef WallModelProbe_H
+#define WallModelProbe_H
+
+#include "Probe.h"
+
+///////////////////////////////////////////////////////////////////////////////////
+//! \brief Probe specialisation for wall model quantities (see file header); adds two optional output/evaluation flags on top of Probe.
+class WallModelProbe : public Probe
+{
+public: 
+    WallModelProbe( // inline constructor: forwards every argument unchanged to Probe, no additional initialisation
+        const std::string _probeName,
+        const std::string _outputPath,
+        uint _tStartAvg,
+        uint _tStartTmpAvg,
+        uint _tAvg,
+        uint _tStartOut,
+        uint _tOut
+    ):  Probe(_probeName, 
+             _outputPath,
+             _tStartAvg,
+             _tStartTmpAvg,
+             _tAvg,
+             _tStartOut, 
+             _tOut,
+             false, // NOTE(review): fixed flag; its meaning is defined by Probe's constructor - confirm in Probe.h
+             true){} // NOTE(review): fixed flag; its meaning is defined by Probe's constructor - confirm in Probe.h
+
+    //! Setters for the optional behaviours below; each one only stores the given flag in the matching private member.
+    void setForceOutputToStress(bool _outputStress){ this->outputStress = _outputStress; }
+    void setEvaluatePressureGradient(bool _evalPressGrad){ this->evaluatePressureGradient = _evalPressGrad; }
+
+private:
+    bool isAvailableStatistic(Statistic _variable) override;
+
+    std::vector<PostProcessingVariable> getPostProcessingVariables(Statistic variable) override;
+
+    void findPoints(Parameter* para, GridProvider* gridProvider, std::vector<int>& probeIndices_level,
+                    std::vector<real>& distX_level, std::vector<real>& distY_level, std::vector<real>& distZ_level,      
+                    std::vector<real>& pointCoordsX_level, std::vector<real>& pointCoordsY_level, std::vector<real>& pointCoordsZ_level,
+                    int level) override;
+    void calculateQuantities(SPtr<ProbeStruct> probeStruct, Parameter* para, uint t, int level) override;
+
+private:
+    bool outputStress = false; //!< if true, output wall force is converted to a stress
+    bool evaluatePressureGradient = false; //!< if true, mean global pressure gradient will also be evaluated
+};
+
+#endif
\ No newline at end of file