diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
old mode 100644
new mode 100755
index e171e2e7fbe1984588355f5a833a21160024da32..be6533afdfd82ee2377ddba11837b938ea7cd311
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -235,7 +235,7 @@ gcc_9_python_hpc_test:
     - hpc-rocket launch --watch Python/SlurmTests/poiseuille/rocket.yml
 
 ###############################################################################
-multigpu_hpc_test:
+regression_test_4gpu:
   image: python:latest
   stage: test
 
@@ -252,12 +252,13 @@ multigpu_hpc_test:
     - pip install "fieldcompare[all]"
 
   script:
-    - hpc-rocket launch --watch regression-tests/multigpu_test/rocket.yml
+    - hpc-rocket launch --watch regression-tests/multigpu_test/rocket4GPU.yml
     - git clone --depth 1 --filter=blob:none --sparse https://github.com/irmb/test_data
     - cd test_data
-    - git sparse-checkout set regression_tests/gpu/DrivenCavity_4GPU_2Levels
+    - git sparse-checkout set regression_tests/gpu/DrivenCavity_4GPU_2Levels regression_tests/gpu/SphereScaling_4GPU_2Levels
     - cd ..
-    - fieldcompare dir output/results test_data/regression_tests/gpu/DrivenCavity_4GPU_2Levels --include-files "*.vtu"
+    - fieldcompare dir output/4GPU test_data/regression_tests/gpu/DrivenCavity_4GPU_2Levels --include-files "DrivenCavityMultiGPU*.vtu"
+    - fieldcompare dir output/4GPU test_data/regression_tests/gpu/SphereScaling_4GPU_2Levels --include-files "SphereScaling*.vtu"
 
 ###############################################################################
 ##                            Benchmark                                      ##
diff --git a/CMake/cmake_config_files/PHOENIX.config.cmake b/CMake/cmake_config_files/PHOENIX.config.cmake
index 5ca4d9821d918f66745fc27363975811dc278440..d31d8684a53a769e48408ad5febe7d2c6b22c623 100644
--- a/CMake/cmake_config_files/PHOENIX.config.cmake
+++ b/CMake/cmake_config_files/PHOENIX.config.cmake
@@ -28,7 +28,7 @@ set(CMAKE_CUDA_ARCHITECTURES 60) # NVIDIA Tesla P100
 
 set(GPU_APP "apps/gpu/LBM/")
 list(APPEND USER_APPS 
-    "${GPU_APP}DrivenCavityMultiGPU"
+    # "${GPU_APP}DrivenCavityMultiGPU"
     # "${GPU_APP}SphereScaling"
     # "${GPU_APP}MusselOyster"
     )
diff --git a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
index b4aba451a253135a19472145921d7e7461a89ded..4c91bf148d422838bdfe647b53e40c97166b9f4e 100644
--- a/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
+++ b/apps/gpu/LBM/BoundaryLayer/BoundaryLayer.cpp
@@ -95,18 +95,12 @@
 #include "utilities/communication.h"
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
 std::string path(".");
 
 std::string simulationName("BoundaryLayer");
 
 using namespace vf::lbm::constant;
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
 
 void multipleLevel(const std::string& configPath)
 {
@@ -284,7 +278,7 @@ void multipleLevel(const std::string& configPath)
 
     gridBuilder->addCoarseGrid( xGridMin,  0.0,  0.0,
                                 xGridMax,  L_y,  L_z, dx);
-    if(true)// Add refinement
+    if(false)// Add refinement
     {
         gridBuilder->setNumberOfLayers(4,0);
         real xMaxRefinement = readPrecursor? xGridMax-H: xGridMax;   //Stop refinement some distance before outlet if domain ist not periodic
@@ -350,13 +344,14 @@ void multipleLevel(const std::string& configPath)
 
     gridBuilder->setStressBoundaryCondition(SideType::MZ,
                                             0.0, 0.0, 1.0,              // wall normals
-                                            samplingOffset, z0, dx);     // wall model settinng
+                                            samplingOffset, z0, dx);    // wall model settinng
+
     para->setHasWallModelMonitor(true);   
     gridBuilder->setSlipBoundaryCondition(SideType::PZ,  0.0f,  0.0f, -1.0f); 
 
     bcFactory.setVelocityBoundaryCondition(BoundaryConditionFactory::VelocityBC::VelocityCompressible);
     bcFactory.setStressBoundaryCondition(BoundaryConditionFactory::StressBC::StressPressureBounceBack);
-    bcFactory.setSlipBoundaryCondition(BoundaryConditionFactory::SlipBC::SlipBounceBack); 
+    bcFactory.setSlipBoundaryCondition(BoundaryConditionFactory::SlipBC::SlipCompressibleTurbulentViscosity); 
     bcFactory.setPressureBoundaryCondition(BoundaryConditionFactory::PressureBC::OutflowNonReflective);
     bcFactory.setPrecursorBoundaryCondition(useDistributions ? BoundaryConditionFactory::PrecursorBC::DistributionsPrecursor : BoundaryConditionFactory::PrecursorBC::VelocityPrecursor);
     para->setOutflowPressureCorrectionFactor(0.0); 
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp b/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
old mode 100644
new mode 100755
index 97f2da1f0e6a0e9bc1250382808fb6bf9ea8462a..46e40044d7b3dca0cfc842ef775f680f0922e9b8
--- a/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
+++ b/apps/gpu/LBM/DrivenCavityMultiGPU/DrivenCavityMultiGPU.cpp
@@ -93,7 +93,7 @@ void multipleLevel(std::filesystem::path& configPath)
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-    const std::string outPath("output/");
+    const std::string outPath("output/" + std::to_string(para->getNumprocs()) + "GPU/");
     const std::string gridPath = "output/";
     std::string simulationName("DrivenCavityMultiGPU");
 
diff --git a/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt b/apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU_regressionTest.txt
similarity index 100%
rename from apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU.txt
rename to apps/gpu/LBM/DrivenCavityMultiGPU/configPhoenix4GPU_regressionTest.txt
diff --git a/apps/gpu/LBM/SphereScaling/SphereScaling.cpp b/apps/gpu/LBM/SphereScaling/SphereScaling.cpp
old mode 100644
new mode 100755
index f33b5e75db60ea88638c2b2cdc11bdb4999088f5..43103a2f02522f7b3898027f9b807fdb84dd9d5c
--- a/apps/gpu/LBM/SphereScaling/SphereScaling.cpp
+++ b/apps/gpu/LBM/SphereScaling/SphereScaling.cpp
@@ -62,28 +62,6 @@
 
 #include "utilities/communication.h"
 
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//
-//          U s e r    s e t t i n g s
-//
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Phoenix
-// const std::string outPath("/work/y0078217/Results/SphereScalingResults/");
-// const std::string gridPathParent = "/work/y0078217/Grids/GridSphereScaling/";
-// const std::string simulationName("SphereScaling");
-// const std::string stlPath("/home/y0078217/STL/Sphere/");
-
-// Relative Paths
-const std::string outPath("./output/SphereScalingResults/");
-const std::string gridPathParent = "./output/grids/SphereScalingResults/";
-const std::string simulationName("SphereScaling");
-const std::string stlPath("./stl/SphereScaling/");
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -119,10 +97,13 @@ void multipleLevel(std::filesystem::path& configPath)
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
     bool useGridGenerator   = true;
-    bool useLevels          = true;
+    bool useLevels = true;
     std::string scalingType = "strong"; // "strong" // "weak"
-    // para->setUseStreams(true);                        // set in config
-    // para->useReducedCommunicationAfterFtoC = true;    // set in config
+
+    const std::string outPath("output/" + std::to_string(para->getNumprocs()) + "GPU/");
+    const std::string simulationName("SphereScaling");
+    const std::string gridPath = "./output/grids/";
+    const std::string stlPath("./stl/SphereScaling/");
 
     if (para->getNumprocs() == 1) {
         para->useReducedCommunicationAfterFtoC = false;
@@ -130,10 +111,9 @@ void multipleLevel(std::filesystem::path& configPath)
     if (scalingType != "weak" && scalingType != "strong")
         std::cerr << "unknown scaling type" << std::endl;
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-    std::string gridPath(gridPathParent); // only for GridGenerator, for GridReader the gridPath needs to be set in the config file
 
     real dxGrid      = (real)1.0;
-    real vxLB        = (real)0.0005; // LB units
+    real vxLB        = (real)0.005;  // LB units
     real viscosityLB = 0.001;        //(vxLB * dxGrid) / Re;
 
     para->setVelocityLB(vxLB);
@@ -142,14 +122,9 @@ void multipleLevel(std::filesystem::path& configPath)
     para->setViscosityRatio((real)0.058823529);
     para->setDensityRatio((real)998.0);
 
-
-    // para->setTimestepOut(10);
-    // para->setTimestepEnd(10);
-
     para->setCalcDragLift(false);
     para->setUseWale(false);
 
-
     para->setOutputPrefix(simulationName);
     if (para->getOutputPath() == "output/") {para->setOutputPath(outPath);}
     para->setPrintFiles(true);
@@ -159,12 +134,8 @@ void multipleLevel(std::filesystem::path& configPath)
     else
         para->setMaxLevel(1);
 
-    // para->setMainKernel("CumulantK17CompChim");
-    para->setMainKernel("CumulantK17CompChimStream");
-    //para->setMainKernel("CumulantK17CompChimRedesigned");
-    scalingFactory.setScalingFactory(GridScalingFactory::GridScaling::ScaleRhoSq);
-
-
+    para->setMainKernel("CumulantK17");
+    scalingFactory.setScalingFactory(GridScalingFactory::GridScaling::ScaleCompressible);
 
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_regressionTest.txt b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_regressionTest.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c5789cdf96049b7c0a31ce693c29cd2db4952a58
--- /dev/null
+++ b/apps/gpu/LBM/SphereScaling/configPhoenix4GPU_regressionTest.txt
@@ -0,0 +1,17 @@
+##################################################
+#GPU Mapping
+##################################################
+Devices="0 1 2 3"
+NumberOfDevices=4
+
+##################################################
+#simulation parameter
+##################################################
+TimeEnd=10000
+TimeOut=10000
+
+##################################################
+# CUDA Streams and optimized communication (only used for multiple GPUs)
+##################################################
+useStreams = true
+useReducedCommunicationInInterpolation = true
\ No newline at end of file
diff --git a/pythonbindings/pyfluids-stubs/bindings/gpu/grid_generator.pyi b/pythonbindings/pyfluids-stubs/bindings/gpu/grid_generator.pyi
index 8d715e4b4cd49e6dbf92da3aedddbc4b869067c4..514dc5053e9574b452d80f61eb3d4e1ebb0f4396 100644
--- a/pythonbindings/pyfluids-stubs/bindings/gpu/grid_generator.pyi
+++ b/pythonbindings/pyfluids-stubs/bindings/gpu/grid_generator.pyi
@@ -67,7 +67,7 @@ class LevelGridBuilder(GridBuilder):
     def set_precursor_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, file_collection: pyfluids.bindings.gpu.VelocityFileCollection, n_t_read: int, velocity_x: float = ..., velocity_y: float = ..., velocity_z: float = ..., file_level_to_grid_level_map: List[int] = ...) -> None: ...
     def set_pressure_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, rho: float) -> None: ...
     def set_slip_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, normal_x: float, normal_y: float, normal_z: float) -> None: ...
-    def set_stress_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, normal_x: float, normal_y: float, normal_z: float, sampling_offset: int, z0: float, dx: float) -> None: ...
+    def set_stress_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, normal_x: float, normal_y: float, normal_z: float, sampling_offset: int, z0: float, dx: float, q: float) -> None: ...
     def set_velocity_boundary_condition(self, side_type: pyfluids.bindings.gpu.SideType, vx: float, vy: float, vz: float) -> None: ...
 
 class MultipleGridBuilder(LevelGridBuilder):
diff --git a/regression-tests/multigpu_test/rocket.yml b/regression-tests/multigpu_test/rocket4GPU.yml
similarity index 72%
rename from regression-tests/multigpu_test/rocket.yml
rename to regression-tests/multigpu_test/rocket4GPU.yml
index f621b1349c042e02f2e834e697147da0822ffe1f..a05ffea6ad04e0d5cfb8d7749111726dfceb4609 100755
--- a/regression-tests/multigpu_test/rocket.yml
+++ b/regression-tests/multigpu_test/rocket4GPU.yml
@@ -3,8 +3,8 @@ user: $PHOENIX_REMOTE_USER
 private_keyfile: $PHOENIX_PRIVATE_KEY
 
 copy:
-  - from: regression-tests/multigpu_test/slurm.job
-    to: multigpu_test/slurm.job
+  - from: regression-tests/multigpu_test/slurm4GPU.job
+    to: multigpu_test/slurm4GPU.job
     overwrite: true
 
   - from: "CMake/"
@@ -36,13 +36,16 @@ copy:
     overwrite: true
 
 collect:
-  - from: multigpu_test/output/
-    to: output/results/
+  - from: multigpu_test/output/4GPU/
+    to: output/4GPU
     overwrite: true
 
-  - from: multigpu_test/slurmMultiGPU.out
-    to: output/slurmMultiGPU.out
+  - from: multigpu_test/slurm4GPU.out
+    to: output/4GPU/slurm4GPU.out
     overwrite: true
 
-sbatch: multigpu_test/slurm.job
+clean:
+  - multigpu_test/output/*
+
+sbatch: multigpu_test/slurm4GPU.job
 continue_if_job_fails: true
diff --git a/regression-tests/multigpu_test/slurm.job b/regression-tests/multigpu_test/slurm4GPU.job
similarity index 61%
rename from regression-tests/multigpu_test/slurm.job
rename to regression-tests/multigpu_test/slurm4GPU.job
index 0ee0df46ab64bab6520f9f46fc939d5b3186fae7..886bfaf7479e01cfef285e9a2dae189258ce0b7e 100755
--- a/regression-tests/multigpu_test/slurm.job
+++ b/regression-tests/multigpu_test/slurm4GPU.job
@@ -3,10 +3,10 @@
 #SBATCH --partition=gpu01_queue
 #SBATCH --nodes=1
 #SBATCH --time=10:00:00
-#SBATCH --job-name=Cavity4GPU
+#SBATCH --job-name=Regr4GPU
 #SBATCH --ntasks-per-node=4
 #SBATCH --gres=gpu:4
-#SBATCH --output=multigpu_test/slurmMultiGPU.out
+#SBATCH --output=multigpu_test/slurm4GPU.out
 ##SBATCH --exclusive
 
 module purge 
@@ -21,9 +21,13 @@ module list
 cd multigpu_test
 mkdir -p build
 cd build
-cmake .. -DBUILD_VF_GPU=ON -DCMAKE_CUDA_ARCHITECTURES=60 -DUSER_APPS="apps/gpu/LBM/DrivenCavityMultiGPU"
+cmake .. -DBUILD_VF_GPU=ON -DCMAKE_CUDA_ARCHITECTURES=60 -DUSER_APPS=apps/gpu/LBM/DrivenCavityMultiGPU\;apps/gpu/LBM/SphereScaling
 make -j 16
 cd ..
 mkdir -p output
 
-mpirun -np 4 "./build/bin/DrivenCavityMultiGPU" "configPhoenix4GPU.txt"
\ No newline at end of file
+echo $'\n\n\n\n---First test: DrivenCavityMultiGPU on 4 GPUs\n\n'
+mpirun -np 4 "./build/bin/DrivenCavityMultiGPU" "configPhoenix4GPU_regressionTest.txt"
+
+echo $'\n\n\n\n---Second test: SphereScaling on 4 GPUs\n\n'
+mpirun -np 4 "./build/bin/SphereScaling"        "configPhoenix4GPU_regressionTest.txt"
\ No newline at end of file
diff --git a/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.cpp b/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.cpp
index 5f3c4ad492b16c09b26acd00a624a54ad65dffda..571796d503a1a73b3eccf631a347884c7522b533 100644
--- a/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.cpp
+++ b/src/gpu/GridGenerator/TransientBCSetter/TransientBCSetter.cpp
@@ -417,7 +417,7 @@ void VTKReader::getNextData(real* data, uint numberOfNodes, real time)
             {
                 numberOfFiles++;
 
-                printf("switching to precursor file no. %zu\n", numberOfFiles);
+                VF_LOG_INFO("PrecursorBC on level {}: switching to file no. {}\n", level, numberOfFiles);
                 if(numberOfFiles == this->fileCollection->files[level][id].size())
                     throw std::runtime_error("Not enough Precursor Files to read");
 
diff --git a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
index ba4eea50ffb6bc136528db31207274d626fe9b15..718a8d5da1de148c72ba67dd2d15c5e3b443e16a 100644
--- a/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
+++ b/src/gpu/GridGenerator/grid/BoundaryConditions/Side.cpp
@@ -75,14 +75,13 @@ void Side::addIndices(SPtr<Grid> grid, SPtr<BoundaryCondition> boundaryCondition
                                             ||  grid->getFieldEntry(index) == vf::gpu::FLUID_FCF
                                             // Overlap of BCs on edge nodes
                                             || grid->nodeHasBC(index) )
-            {
+            {   
                 grid->setFieldEntry(index, boundaryCondition->getType());
                 boundaryCondition->indices.push_back(index);
                 setPressureNeighborIndices(boundaryCondition, grid, index);
                 setStressSamplingIndices(boundaryCondition, grid, index);
-
+                // if(grid->getFieldEntry(index)==26) printf("index = %u, v1 = %f, v2 = %f, field entry=%u \n", index, v1, v2, grid->getFieldEntry(index) );
                 setQs(grid, boundaryCondition, index);
-
                 boundaryCondition->patches.push_back(0);
             }
         }
diff --git a/src/gpu/GridGenerator/grid/GridImp.cpp b/src/gpu/GridGenerator/grid/GridImp.cpp
index 32cf9d07da87149695a5bf548ed357be2b2f71b4..9fc3c099e9f382c8d22fa64626c89219ef29c360 100644
--- a/src/gpu/GridGenerator/grid/GridImp.cpp
+++ b/src/gpu/GridGenerator/grid/GridImp.cpp
@@ -2115,16 +2115,22 @@ void GridImp::sortFluidNodeIndicesMacroVars()
         if(this->fluidNodeIndicesAllFeatures.size()>0)
         {
             this->fluidNodeIndicesMacroVars.erase(   std::remove_if(   this->fluidNodeIndicesMacroVars.begin(), this->fluidNodeIndicesMacroVars.end(),
-                                                        [&](auto x){return binary_search(fluidNodeIndicesAllFeatures.begin(),fluidNodeIndicesAllFeatures.end(),x);} ),
-                                            this->fluidNodeIndicesMacroVars.end()
-                                        );
+                                                    [&](auto x){return binary_search(fluidNodeIndicesAllFeatures.begin(),fluidNodeIndicesAllFeatures.end(),x);} ),
+                                                    this->fluidNodeIndicesMacroVars.end() );
+        }
+
+        // Remove all indices in fluidNodeIndicesBorder from fluidNodeIndicesApplyBodyForce
+        if(this->fluidNodeIndicesBorder.size()>0)
+        {
+            this->fluidNodeIndicesMacroVars.erase(  std::remove_if(   this->fluidNodeIndicesMacroVars.begin(), this->fluidNodeIndicesMacroVars.end(),
+                                                    [&](auto x){return binary_search(fluidNodeIndicesBorder.begin(),fluidNodeIndicesBorder.end(),x);} ),
+                                                    this->fluidNodeIndicesMacroVars.end() );
         }
 
         // Remove indices of fluidNodeIndicesMacroVars from fluidNodeIndices
         this->fluidNodeIndices.erase(   std::remove_if(   this->fluidNodeIndices.begin(), this->fluidNodeIndices.end(),
                                                         [&](auto x){return binary_search(fluidNodeIndicesMacroVars.begin(),fluidNodeIndicesMacroVars.end(),x);} ),
-                                        this->fluidNodeIndices.end()
-                                    );
+                                        this->fluidNodeIndices.end() );
     }
 }
 
@@ -2136,20 +2142,26 @@ void GridImp::sortFluidNodeIndicesApplyBodyForce()
         // Remove duplicates
         this->fluidNodeIndicesApplyBodyForce.erase( unique( this->fluidNodeIndicesApplyBodyForce.begin(), this->fluidNodeIndicesApplyBodyForce.end() ), this->fluidNodeIndicesApplyBodyForce.end() );
 
-         // Remove indices of fluidNodeIndicesAllFeatures from fluidNodeIndicesMacroVars
+         // Remove indices of fluidNodeIndicesAllFeatures from fluidNodeIndicesApplyBodyForce
         if(this->fluidNodeIndicesAllFeatures.size()>0)
         {
-            this->fluidNodeIndicesApplyBodyForce.erase(   std::remove_if(   this->fluidNodeIndicesApplyBodyForce.begin(), this->fluidNodeIndicesApplyBodyForce.end(),
+            this->fluidNodeIndicesApplyBodyForce.erase( std::remove_if(   this->fluidNodeIndicesApplyBodyForce.begin(), this->fluidNodeIndicesApplyBodyForce.end(),
                                                         [&](auto x){return binary_search(fluidNodeIndicesAllFeatures.begin(),fluidNodeIndicesAllFeatures.end(),x);} ),
-                                            this->fluidNodeIndicesApplyBodyForce.end()
-                                        );
+                                                        this->fluidNodeIndicesApplyBodyForce.end() );
+        }
+
+        // Remove all indices in fluidNodeIndicesBorder from fluidNodeIndicesApplyBodyForce
+        if(this->fluidNodeIndicesBorder.size()>0)
+        {
+            this->fluidNodeIndicesApplyBodyForce.erase( std::remove_if(   this->fluidNodeIndicesApplyBodyForce.begin(), this->fluidNodeIndicesApplyBodyForce.end(),
+                                                        [&](auto x){return binary_search(fluidNodeIndicesBorder.begin(),fluidNodeIndicesBorder.end(),x);} ),
+                                                        this->fluidNodeIndicesApplyBodyForce.end() );
         }
 
         // Remove indices of fluidNodeIndicesMacroVars from fluidNodeIndices
         this->fluidNodeIndices.erase(   std::remove_if(   this->fluidNodeIndices.begin(), this->fluidNodeIndices.end(),
-                                                        [&](auto x){return binary_search(fluidNodeIndicesApplyBodyForce.begin(),fluidNodeIndicesApplyBodyForce.end(),x);} ),
-                                        this->fluidNodeIndices.end()
-                                    );
+                                        [&](auto x){return binary_search(fluidNodeIndicesApplyBodyForce.begin(),fluidNodeIndicesApplyBodyForce.end(),x);} ),
+                                        this->fluidNodeIndices.end() );
     }
 }
 
@@ -2160,11 +2172,19 @@ void GridImp::sortFluidNodeIndicesAllFeatures()
         sort(this->fluidNodeIndicesAllFeatures.begin(), this->fluidNodeIndicesAllFeatures.end());
         // Remove duplicates
         this->fluidNodeIndicesAllFeatures.erase( unique( this->fluidNodeIndicesAllFeatures.begin(), this->fluidNodeIndicesAllFeatures.end() ), this->fluidNodeIndicesAllFeatures.end() );
-        // Remove indices of fluidNodeIndicesMacroVars from fluidNodeIndices
+
+        // Remove all indices in fluidNodeIndicesBorder from fluidNodeIndicesAllFeatures
+        if(this->fluidNodeIndicesBorder.size()>0)
+        {
+            this->fluidNodeIndicesAllFeatures.erase(    std::remove_if(   this->fluidNodeIndicesAllFeatures.begin(), this->fluidNodeIndicesAllFeatures.end(),
+                                                        [&](auto x){return binary_search(fluidNodeIndicesBorder.begin(),fluidNodeIndicesBorder.end(),x);} ),
+                                                        this->fluidNodeIndicesAllFeatures.end() );
+        }
+
+        // Remove indices of fluidNodeIndicesAllFeatures from fluidNodeIndices
         this->fluidNodeIndices.erase(   std::remove_if(   this->fluidNodeIndices.begin(), this->fluidNodeIndices.end(),
                                                         [&](auto x){return binary_search(fluidNodeIndicesAllFeatures.begin(),fluidNodeIndicesAllFeatures.end(),x);} ),
-                                        this->fluidNodeIndices.end()
-                                    );
+                                        this->fluidNodeIndices.end() );
     }
 }
 
diff --git a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
index 38a7eef7e356e2f2da4c1a819d8375035a37313a..c2f86721de26d516ed60f497a65d1d46a34aa182 100644
--- a/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
+++ b/src/gpu/VirtualFluids_GPU/DataStructureInitializer/GridReaderGenerator/GridGenerator.cpp
@@ -1002,7 +1002,7 @@ void GridGenerator::allocArrays_BoundaryQs()
             unsigned int sizeQ = para->getParH(i)->stressBC.numberOfBCnodes;
             QforBoundaryConditions &Q = para->getParH(i)->stressBC;
             getPointersToBoundaryConditions(Q, QQ, sizeQ);
-
+            
             builder->getStressQs(Q.q27, i);
             ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
             cudaMemoryManager->cudaCopyStressBC(i);
diff --git a/src/gpu/VirtualFluids_GPU/Factories/GridScalingFactory.cpp b/src/gpu/VirtualFluids_GPU/Factories/GridScalingFactory.cpp
index 00a4c79574ce9d8ae372bfe9f7e546c05175bb10..49a6887ef2e462aba190023d334caa0012e2254e 100644
--- a/src/gpu/VirtualFluids_GPU/Factories/GridScalingFactory.cpp
+++ b/src/gpu/VirtualFluids_GPU/Factories/GridScalingFactory.cpp
@@ -6,7 +6,7 @@ void GridScalingFactory::setScalingFactory(const GridScalingFactory::GridScaling
     this->gridScaling = gridScalingType;
 }
 
-gridScalingFC GridScalingFactory::getGridScalingFC() const
+gridScalingFC GridScalingFactory::getGridScalingFC(bool hasTurbulentViscosity) const
 {
     // for descriptions of the scaling types refer to the header
     switch (gridScaling) {
@@ -14,14 +14,15 @@ gridScalingFC GridScalingFactory::getGridScalingFC() const
             return ScaleFC_RhoSq_comp_27;
             break;
         case GridScaling::ScaleCompressible:
-            return ScaleFC_compressible;
+            if(hasTurbulentViscosity)   return ScaleFC_compressible<true>;
+            else                        return ScaleFC_compressible<false>;
             break;
         default:
             return nullptr;
     }
 }
 
-gridScalingCF GridScalingFactory::getGridScalingCF() const
+gridScalingCF GridScalingFactory::getGridScalingCF(bool hasTurbulentViscosity) const
 {
     // for descriptions of the scaling types refer to the header
     switch (gridScaling) {
@@ -29,8 +30,11 @@ gridScalingCF GridScalingFactory::getGridScalingCF() const
             return ScaleCF_RhoSq_comp_27;
             break;
         case GridScaling::ScaleCompressible:
-            return ScaleCF_compressible;
-            break;
+            {
+                if(hasTurbulentViscosity)   return ScaleCF_compressible<true>;
+                else                        return ScaleCF_compressible<false>;
+                break;
+            }
         default:
             return nullptr;
     }
diff --git a/src/gpu/VirtualFluids_GPU/Factories/GridScalingFactory.h b/src/gpu/VirtualFluids_GPU/Factories/GridScalingFactory.h
index 7d7c20c63a01e2dba6a5578c6520c0ab06894b3c..d760240c2c5ed429799cd89e57704464515a92f5 100644
--- a/src/gpu/VirtualFluids_GPU/Factories/GridScalingFactory.h
+++ b/src/gpu/VirtualFluids_GPU/Factories/GridScalingFactory.h
@@ -59,8 +59,8 @@ public:
 
     void setScalingFactory(const GridScalingFactory::GridScaling gridScalingType);
 
-    [[nodiscard]] gridScalingFC getGridScalingFC() const;
-    [[nodiscard]] gridScalingCF getGridScalingCF() const;
+    [[nodiscard]] gridScalingFC getGridScalingFC(bool hasTurbulentViscosity) const;
+    [[nodiscard]] gridScalingCF getGridScalingCF(bool hasTurbulentViscosity) const;
 
 private:
     GridScaling gridScaling = GridScaling::NotSpecified;
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
index ae8cbb77ec2493126d64b90a7119cbfa3efee666..4a5b7816c1b6591e4193639bcdf71242e77688c0 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Interface.h
@@ -1624,7 +1624,8 @@ void ScaleCF_staggered_time_comp_27( real* DC,
 												OffCF offCF);
 
 void ScaleCF_RhoSq_comp_27(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF &offsetCF, CUstream_st *stream);
-void ScaleCF_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF &offsetCF, CUstream_st *stream);
+
+template<bool hasTurbulentViscosity> void ScaleCF_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF &offsetCF, CUstream_st *stream);
 
 void ScaleCF_RhoSq_3rdMom_comp_27( real* DC, 
 											  real* DF, 
@@ -1849,7 +1850,8 @@ void ScaleFC_staggered_time_comp_27( real* DC,
 												OffFC offFC);
 
 void ScaleFC_RhoSq_comp_27(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC& offsetFC, CUstream_st *stream);
-void ScaleFC_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC& offsetFC, CUstream_st *stream);
+
+template<bool hasTurbulentViscosity> void ScaleFC_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC& offsetFC, CUstream_st *stream);
 
 void ScaleFC_RhoSq_3rdMom_comp_27( real* DC, 
 											  real* DF, 
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
index 3134db44346ee7f465a5c8f04505ee5749482fbf..0c3c7fcefc2bbb7bc87d7d95863c8c74f14735a3 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
+++ b/src/gpu/VirtualFluids_GPU/GPU/GPU_Kernels.cuh
@@ -1860,7 +1860,7 @@ __global__ void scaleCF_RhoSq_comp_27( real* DC,
                                                   unsigned int nyF,
                                                   OffCF offCF);
 
-__global__ void scaleCF_compressible(
+template<bool hasTurbulentViscosity> __global__ void scaleCF_compressible(
     real* distributionsCoarse,
     real* distributionsFine,
     unsigned int* neighborXcoarse,
@@ -1877,6 +1877,8 @@ __global__ void scaleCF_compressible(
     unsigned int numberOfInterfaceNodes,
     real omegaCoarse,
     real omegaFine,
+    real* turbulentViscosityCoarse,
+    real* turbulentViscosityFine,
     OffCF offsetCF);
 
 __global__ void scaleCF_RhoSq_3rdMom_comp_27(real* DC,
@@ -2263,7 +2265,7 @@ __global__ void scaleFC_RhoSq_comp_27( real* DC,
                                                   unsigned int nyF,
                                                   OffFC offFC);
 
-__global__ void scaleFC_compressible(
+template<bool hasTurbulentViscosity> __global__ void scaleFC_compressible(
     real *distributionsCoarse,
     real *distributionsFine,
     unsigned int *neighborXcoarse,
@@ -2280,6 +2282,8 @@ __global__ void scaleFC_compressible(
     unsigned int numberOfInterfaceNodes,
     real omegaCoarse,
     real omegaFine,
+    real* turbulentViscosityCoarse,
+    real* turbulentViscosityFine,
     OffFC offsetFC);
 
 __global__ void scaleFC_RhoSq_3rdMom_comp_27(real* DC,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleCF_compressible.cu b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleCF_compressible.cu
index 0724002cffa3a47820664851ffefd1c35dbe0235..84529ef2694b57448291957f7792360088bd954f 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleCF_compressible.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleCF_compressible.cu
@@ -218,7 +218,7 @@ __device__ __inline__ void interpolateDistributions(
 //!
 
 // based on scaleCF_RhoSq_comp_27
-__global__ void scaleCF_compressible(
+template<bool hasTurbulentViscosity> __global__ void scaleCF_compressible(
     real* distributionsCoarse, 
     real* distributionsFine, 
     unsigned int* neighborXcoarse,
@@ -235,6 +235,8 @@ __global__ void scaleCF_compressible(
     unsigned int numberOfInterfaceNodes, 
     real omegaCoarse, 
     real omegaFine, 
+    real* turbulentViscosityCoarse,
+    real* turbulentViscosityFine,
     OffCF offsetCF)
 {
     ////////////////////////////////////////////////////////////////////////////////
@@ -310,6 +312,8 @@ __global__ void scaleCF_compressible(
     unsigned int k_0MM = k_base_0MM;
     unsigned int k_MMM = k_base_MMM;
 
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
+
     calculateMomentsOnSourceNodes( distCoarse, omegaC,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_MMM, vx1_MMM, vx2_MMM, vx3_MMM,
         kxyFromfcNEQ_MMM, kyzFromfcNEQ_MMM, kxzFromfcNEQ_MMM, kxxMyyFromfcNEQ_MMM, kxxMzzFromfcNEQ_MMM);
@@ -327,6 +331,8 @@ __global__ void scaleCF_compressible(
     k_0MM = neighborZcoarse[k_0MM];
     k_MMM = neighborZcoarse[k_MMM];
 
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
+
     calculateMomentsOnSourceNodes( distCoarse, omegaC,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_MMP, vx1_MMP, vx2_MMP, vx3_MMP,
         kxyFromfcNEQ_MMP, kyzFromfcNEQ_MMP, kxzFromfcNEQ_MMP, kxxMyyFromfcNEQ_MMP, kxxMzzFromfcNEQ_MMP);
@@ -344,6 +350,8 @@ __global__ void scaleCF_compressible(
     k_0MM = k_MMM;
     k_MMM = neighborXcoarse[k_MMM];
 
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
+
     calculateMomentsOnSourceNodes( distCoarse, omegaC,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_PMP, vx1_PMP, vx2_PMP, vx3_PMP,
         kxyFromfcNEQ_PMP, kyzFromfcNEQ_PMP, kxzFromfcNEQ_PMP, kxxMyyFromfcNEQ_PMP, kxxMzzFromfcNEQ_PMP);
@@ -361,6 +369,8 @@ __global__ void scaleCF_compressible(
     k_0M0 = k_base_MM0;
     k_MM0 = neighborXcoarse[k_base_MM0];
 
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
+
     calculateMomentsOnSourceNodes( distCoarse, omegaC,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_PMM, vx1_PMM, vx2_PMM, vx3_PMM,
         kxyFromfcNEQ_PMM, kyzFromfcNEQ_PMM, kxzFromfcNEQ_PMM, kxxMyyFromfcNEQ_PMM, kxxMzzFromfcNEQ_PMM);
@@ -388,6 +398,8 @@ __global__ void scaleCF_compressible(
     k_0MM = k_base_0MM;
     k_MMM = k_base_MMM;
 
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
+
     calculateMomentsOnSourceNodes( distCoarse, omegaC,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_MPM, vx1_MPM, vx2_MPM, vx3_MPM,
         kxyFromfcNEQ_MPM, kyzFromfcNEQ_MPM, kxzFromfcNEQ_MPM, kxxMyyFromfcNEQ_MPM, kxxMzzFromfcNEQ_MPM);
@@ -404,6 +416,8 @@ __global__ void scaleCF_compressible(
     k_M0M = neighborZcoarse[k_M0M];
     k_0MM = neighborZcoarse[k_0MM];
     k_MMM = neighborZcoarse[k_MMM];
+
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
     
     calculateMomentsOnSourceNodes( distCoarse, omegaC,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_MPP, vx1_MPP, vx2_MPP, vx3_MPP,
@@ -423,11 +437,12 @@ __global__ void scaleCF_compressible(
     k_0MM = k_MMM;
     k_MMM = neighborXcoarse[k_MMM];
 
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
+
     calculateMomentsOnSourceNodes( distCoarse, omegaC,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_PPP, vx1_PPP, vx2_PPP, vx3_PPP,
         kxyFromfcNEQ_PPP, kyzFromfcNEQ_PPP, kxzFromfcNEQ_PPP, kxxMyyFromfcNEQ_PPP, kxxMzzFromfcNEQ_PPP);
 
-
     //////////////////////////////////////////////////////////////////////////
     // source node BNE = PPM
     //////////////////////////////////////////////////////////////////////////
@@ -440,6 +455,8 @@ __global__ void scaleCF_compressible(
     k_M00 = neighborXcoarse[k_base_M00];
     k_0M0 = k_base_MM0;
     k_MM0 = neighborXcoarse[k_base_MM0];
+
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
     
     calculateMomentsOnSourceNodes( distCoarse, omegaC,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_PPM, vx1_PPM, vx2_PPM, vx3_PPM,
@@ -883,28 +900,6 @@ __global__ void scaleCF_compressible(
     real y = -c1o4;
     real z = -c1o4;
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-    ////////////////////////////////////////////////////////////////////////////////
-    //! - Set moments (zeroth to sixth order) on destination node
-    //!
-    interpolateDistributions(
-        x, y, z,
-        m_000, 
-        m_100, m_010, m_001,
-        m_011, m_101, m_110, m_200, m_020, m_002,
-        m_111, m_210, m_012, m_201, m_021, m_120, m_102,
-        m_022, m_202, m_220, m_211, m_121, m_112,
-        m_122, m_212, m_221,
-        m_222,
-        a_000, a_100, a_010, a_001, a_200, a_020, a_002, a_110,  a_101, a_011, a_111,
-        b_000, b_100, b_010, b_001, b_200, b_020, b_002, b_110,  b_101, b_011, b_111,
-        c_000, c_100, c_010, c_001, c_200, c_020, c_002, c_110,  c_101, c_011, c_111,
-        d_000, d_100, d_010, d_001, d_110, d_101, d_011, d_111,
-        LaplaceRho, eps_new, omegaF, 
-        kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
-    );
-
-    //////////////////////////////////////////////////////////////////////////
     // index of the base node and its neighbors
     k_base_000 = indicesFineMMM[nodeIndex];
     k_base_M00 = neighborXfine [k_base_000];
@@ -924,6 +919,28 @@ __global__ void scaleCF_compressible(
     k_M0M = k_base_M0M;
     k_0MM = k_base_0MM;
     k_MMM = k_base_MMM;
+    ////////////////////////////////////////////////////////////////////////////////
+    //! - Set moments (zeroth to sixth order) on destination node
+    //!
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
+    interpolateDistributions(
+        x, y, z,
+        m_000, 
+        m_100, m_010, m_001,
+        m_011, m_101, m_110, m_200, m_020, m_002,
+        m_111, m_210, m_012, m_201, m_021, m_120, m_102,
+        m_022, m_202, m_220, m_211, m_121, m_112,
+        m_122, m_212, m_221,
+        m_222,
+        a_000, a_100, a_010, a_001, a_200, a_020, a_002, a_110,  a_101, a_011, a_111,
+        b_000, b_100, b_010, b_001, b_200, b_020, b_002, b_110,  b_101, b_011, b_111,
+        c_000, c_100, c_010, c_001, c_200, c_020, c_002, c_110,  c_101, c_011, c_111,
+        d_000, d_100, d_010, d_001, d_110, d_101, d_011, d_111,
+        LaplaceRho, eps_new, omegaF, 
+        kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
+    );
 
     //////////////////////////////////////////////////////////////////////////
     //! - Write distributions: style of reading and writing the distributions from/to
@@ -968,9 +985,22 @@ __global__ void scaleCF_compressible(
     x = -c1o4;
     y = -c1o4;
     z =  c1o4;
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Set neighbor indices
+    k_000 = k_00M;
+    k_M00 = k_M0M;
+    k_0M0 = k_0MM;
+    k_00M = neighborZfine[k_00M];
+    k_MM0 = k_MMM;
+    k_M0M = neighborZfine[k_M0M];
+    k_0MM = neighborZfine[k_0MM];
+    k_MMM = neighborZfine[k_MMM];
 
     ////////////////////////////////////////////////////////////////////////////////
     // Set moments (zeroth to sixth orders) on destination node
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     interpolateDistributions(
         x, y, z,
         m_000, 
@@ -988,17 +1018,6 @@ __global__ void scaleCF_compressible(
         kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
     );
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Set neighbor indices
-    k_000 = k_00M;
-    k_M00 = k_M0M;
-    k_0M0 = k_0MM;
-    k_00M = neighborZfine[k_00M];
-    k_MM0 = k_MMM;
-    k_M0M = neighborZfine[k_M0M];
-    k_0MM = neighborZfine[k_0MM];
-    k_MMM = neighborZfine[k_MMM];
-
     //////////////////////////////////////////////////////////////////////////
     // Write distributions
     (distFine.f[DIR_000])[k_000] = f_000;
@@ -1038,9 +1057,21 @@ __global__ void scaleCF_compressible(
     y = -c1o4;
     z =  c1o4;
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Set neighbor indices
+    k_000 = k_M00;
+    k_M00 = neighborXfine[k_M00];
+    k_0M0 = k_MM0;
+    k_00M = k_M0M;
+    k_MM0 = neighborXfine[k_MM0];
+    k_M0M = neighborXfine[k_M0M];
+    k_0MM = k_MMM;
+    k_MMM = neighborXfine[k_MMM];
 
     ////////////////////////////////////////////////////////////////////////////////
     // Set moments (zeroth to sixth orders) on destination node
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     interpolateDistributions(
         x, y, z,
         m_000, 
@@ -1058,17 +1089,6 @@ __global__ void scaleCF_compressible(
         kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
     );
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Set neighbor indices
-    k_000 = k_M00;
-    k_M00 = neighborXfine[k_M00];
-    k_0M0 = k_MM0;
-    k_00M = k_M0M;
-    k_MM0 = neighborXfine[k_MM0];
-    k_M0M = neighborXfine[k_M0M];
-    k_0MM = k_MMM;
-    k_MMM = neighborXfine[k_MMM];
-
     //////////////////////////////////////////////////////////////////////////
     // Write distributions
     (distFine.f[DIR_000])[k_000] = f_000;
@@ -1107,9 +1127,22 @@ __global__ void scaleCF_compressible(
     x =  c1o4;
     y = -c1o4;
     z = -c1o4;
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Set neighbor indices
+    k_00M = k_000;
+    k_M0M = k_M00;
+    k_0MM = k_0M0;
+    k_MMM = k_MM0;
+    k_000 = k_base_M00;
+    k_M00 = neighborXfine[k_base_M00];
+    k_0M0 = k_base_MM0;
+    k_MM0 = neighborXfine[k_base_MM0];
 
     ////////////////////////////////////////////////////////////////////////////////
     // Set moments (zeroth to sixth orders) on destination node
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     interpolateDistributions(
         x, y, z,
         m_000, 
@@ -1127,17 +1160,6 @@ __global__ void scaleCF_compressible(
         kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
     );
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Set neighbor indices
-    k_00M = k_000;
-    k_M0M = k_M00;
-    k_0MM = k_0M0;
-    k_MMM = k_MM0;
-    k_000 = k_base_M00;
-    k_M00 = neighborXfine[k_base_M00];
-    k_0M0 = k_base_MM0;
-    k_MM0 = neighborXfine[k_base_MM0];
-
     //////////////////////////////////////////////////////////////////////////
     // Write distributions
     (distFine.f[DIR_000])[k_000] = f_000;
@@ -1177,25 +1199,6 @@ __global__ void scaleCF_compressible(
     y =  c1o4;
     z = -c1o4;
     
-    ////////////////////////////////////////////////////////////////////////////////
-    // Set moments (zeroth to sixth orders) on destination node
-    interpolateDistributions(
-        x, y, z,
-        m_000, 
-        m_100, m_010, m_001,
-        m_011, m_101, m_110, m_200, m_020, m_002,
-        m_111, m_210, m_012, m_201, m_021, m_120, m_102,
-        m_022, m_202, m_220, m_211, m_121, m_112,
-        m_122, m_212, m_221,
-        m_222,
-        a_000, a_100, a_010, a_001, a_200, a_020, a_002, a_110,  a_101, a_011, a_111,
-        b_000, b_100, b_010, b_001, b_200, b_020, b_002, b_110,  b_101, b_011, b_111,
-        c_000, c_100, c_010, c_001, c_200, c_020, c_002, c_110,  c_101, c_011, c_111,
-        d_000, d_100, d_010, d_001, d_110, d_101, d_011, d_111,
-        LaplaceRho, eps_new, omegaF, 
-        kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
-    );
-
     //////////////////////////////////////////////////////////////////////////
     // index of the base node and its neighbors
     k_base_000 = k_base_0M0;
@@ -1218,6 +1221,28 @@ __global__ void scaleCF_compressible(
     k_0MM = k_base_0MM;
     k_MMM = k_base_MMM;
 
+    ////////////////////////////////////////////////////////////////////////////////
+    // Set moments (zeroth to sixth orders) on destination node
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
+    interpolateDistributions(
+        x, y, z,
+        m_000, 
+        m_100, m_010, m_001,
+        m_011, m_101, m_110, m_200, m_020, m_002,
+        m_111, m_210, m_012, m_201, m_021, m_120, m_102,
+        m_022, m_202, m_220, m_211, m_121, m_112,
+        m_122, m_212, m_221,
+        m_222,
+        a_000, a_100, a_010, a_001, a_200, a_020, a_002, a_110,  a_101, a_011, a_111,
+        b_000, b_100, b_010, b_001, b_200, b_020, b_002, b_110,  b_101, b_011, b_111,
+        c_000, c_100, c_010, c_001, c_200, c_020, c_002, c_110,  c_101, c_011, c_111,
+        d_000, d_100, d_010, d_001, d_110, d_101, d_011, d_111,
+        LaplaceRho, eps_new, omegaF, 
+        kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
+    );
+
     //////////////////////////////////////////////////////////////////////////
     // Write distributions
     (distFine.f[DIR_000])[k_000] = f_000;
@@ -1256,9 +1281,22 @@ __global__ void scaleCF_compressible(
     x = -c1o4;
     y =  c1o4;
     z =  c1o4;
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Set neighbor indices
+    k_000 = k_00M;
+    k_M00 = k_M0M;
+    k_0M0 = k_0MM;
+    k_00M = neighborZfine[k_00M];
+    k_MM0 = k_MMM;
+    k_M0M = neighborZfine[k_M0M];
+    k_0MM = neighborZfine[k_0MM];
+    k_MMM = neighborZfine[k_MMM];
 
     ////////////////////////////////////////////////////////////////////////////////
     // Set moments (zeroth to sixth orders) on destination node
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     interpolateDistributions(
         x, y, z,
         m_000, 
@@ -1276,17 +1314,6 @@ __global__ void scaleCF_compressible(
         kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
     );
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Set neighbor indices
-    k_000 = k_00M;
-    k_M00 = k_M0M;
-    k_0M0 = k_0MM;
-    k_00M = neighborZfine[k_00M];
-    k_MM0 = k_MMM;
-    k_M0M = neighborZfine[k_M0M];
-    k_0MM = neighborZfine[k_0MM];
-    k_MMM = neighborZfine[k_MMM];
-
     //////////////////////////////////////////////////////////////////////////
     // Write distributions
     (distFine.f[DIR_000])[k_000] = f_000;
@@ -1325,9 +1352,22 @@ __global__ void scaleCF_compressible(
     x = c1o4;
     y = c1o4;
     z = c1o4;
+    ////////////////////////////////////////////////////////////////////////////////////
+    // Set neighbor indices
+    k_000 = k_M00;
+    k_M00 = neighborXfine[k_M00];
+    k_0M0 = k_MM0;
+    k_00M = k_M0M;
+    k_MM0 = neighborXfine[k_MM0];
+    k_M0M = neighborXfine[k_M0M];
+    k_0MM = k_MMM;
+    k_MMM = neighborXfine[k_MMM];
 
     ////////////////////////////////////////////////////////////////////////////////
     // Set moments (zeroth to sixth orders) on destination node
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     interpolateDistributions(
         x, y, z,
         m_000, 
@@ -1345,17 +1385,6 @@ __global__ void scaleCF_compressible(
         kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
     );
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Set neighbor indices
-    k_000 = k_M00;
-    k_M00 = neighborXfine[k_M00];
-    k_0M0 = k_MM0;
-    k_00M = k_M0M;
-    k_MM0 = neighborXfine[k_MM0];
-    k_M0M = neighborXfine[k_M0M];
-    k_0MM = k_MMM;
-    k_MMM = neighborXfine[k_MMM];
-
     //////////////////////////////////////////////////////////////////////////
     // Write distributions
     (distFine.f[DIR_000])[k_000] = f_000;
@@ -1394,9 +1423,22 @@ __global__ void scaleCF_compressible(
     x =  c1o4;
     y =  c1o4;
     z = -c1o4;
+    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Set neighbor indices
+    k_00M = k_000;
+    k_M0M = k_M00;
+    k_0MM = k_0M0;
+    k_MMM = k_MM0;
+    k_000 = k_base_M00;
+    k_M00 = neighborXfine[k_base_M00];
+    k_0M0 = k_base_MM0;
+    k_MM0 = neighborXfine[k_base_MM0];
 
     ////////////////////////////////////////////////////////////////////////////////
     // Set moments (zeroth to sixth orders) on destination node
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     interpolateDistributions(
         x, y, z,
         m_000, 
@@ -1414,17 +1456,6 @@ __global__ void scaleCF_compressible(
         kxxMyyAverage, kxxMzzAverage, kyzAverage, kxzAverage, kxyAverage
     );
 
-    ////////////////////////////////////////////////////////////////////////////////////
-    // Set neighbor indices
-    k_00M = k_000;
-    k_M0M = k_M00;
-    k_0MM = k_0M0;
-    k_MMM = k_MM0;
-    k_000 = k_base_M00;
-    k_M00 = neighborXfine[k_base_M00];
-    k_0M0 = k_base_MM0;
-    k_MM0 = neighborXfine[k_base_MM0];
-
     //////////////////////////////////////////////////////////////////////////
     // Write distributions
     (distFine.f[DIR_000])[k_000] = f_000;
@@ -1455,3 +1486,7 @@ __global__ void scaleCF_compressible(
     (distFine.f[DIR_PMM])[k_0MM] = f_PMM;
     (distFine.f[DIR_MMM])[k_MMM] = f_MMM;
 }
+
+template __global__ void scaleCF_compressible<true>( real* distributionsCoarse, real* distributionsFine, unsigned int* neighborXcoarse, unsigned int* neighborYcoarse, unsigned int* neighborZcoarse, unsigned int* neighborXfine, unsigned int* neighborYfine, unsigned int* neighborZfine, unsigned long long numberOfLBnodesCoarse, unsigned long long numberOfLBnodesFine, bool isEvenTimestep, unsigned int* indicesCoarseMMM, unsigned int* indicesFineMMM, unsigned int numberOfInterfaceNodes, real omegaCoarse, real omegaFine, real* turbulentViscosityCoarse, real* turbulentViscosityFine, OffCF offsetCF);
+
+template __global__ void scaleCF_compressible<false>( real* distributionsCoarse, real* distributionsFine, unsigned int* neighborXcoarse, unsigned int* neighborYcoarse, unsigned int* neighborZcoarse, unsigned int* neighborXfine, unsigned int* neighborYfine, unsigned int* neighborZfine, unsigned long long numberOfLBnodesCoarse, unsigned long long numberOfLBnodesFine, bool isEvenTimestep, unsigned int* indicesCoarseMMM, unsigned int* indicesFineMMM, unsigned int numberOfInterfaceNodes, real omegaCoarse, real omegaFine, real* turbulentViscosityCoarse, real* turbulentViscosityFine, OffCF offsetCF);
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleFC_compressible.cu b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleFC_compressible.cu
index e7d999d108e59bca98bf87b813f9479f1c601266..c89a524c1dd63f426254c395d1e4881a7e96ce7a 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleFC_compressible.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/GridScaling/scaleFC_compressible.cu
@@ -46,7 +46,7 @@ using namespace vf::gpu;
 //!
 
 // based on scaleFC_RhoSq_comp_27
-__global__ void scaleFC_compressible(
+template<bool hasTurbulentViscosity> __global__ void scaleFC_compressible(
     real *distributionsCoarse,
     real *distributionsFine,
     unsigned int *neighborXcoarse,
@@ -63,6 +63,8 @@ __global__ void scaleFC_compressible(
     unsigned int numberOfInterfaceNodes,
     real omegaCoarse,
     real omegaFine,
+    real* turbulentViscosityCoarse,
+    real* turbulentViscosityFine,
     OffFC offsetFC)
 {
     ////////////////////////////////////////////////////////////////////////////////
@@ -138,6 +140,8 @@ __global__ void scaleFC_compressible(
     unsigned int k_0MM = k_base_0MM;
     unsigned int k_MMM = k_base_MMM;
 
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     calculateMomentsOnSourceNodes( distFine, omegaF,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_MMM, vx1_MMM, vx2_MMM, vx3_MMM,
         kxyFromfcNEQ_MMM, kyzFromfcNEQ_MMM, kxzFromfcNEQ_MMM, kxxMyyFromfcNEQ_MMM, kxxMzzFromfcNEQ_MMM);
@@ -155,6 +159,8 @@ __global__ void scaleFC_compressible(
     k_0MM = neighborZfine[k_0MM];
     k_MMM = neighborZfine[k_MMM];
 
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     calculateMomentsOnSourceNodes( distFine, omegaF,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_MMP, vx1_MMP, vx2_MMP, vx3_MMP,
         kxyFromfcNEQ_MMP, kyzFromfcNEQ_MMP, kxzFromfcNEQ_MMP, kxxMyyFromfcNEQ_MMP, kxxMzzFromfcNEQ_MMP);
@@ -172,6 +178,8 @@ __global__ void scaleFC_compressible(
     k_0MM = k_MMM;
     k_MMM = neighborXfine[k_MMM];
 
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     calculateMomentsOnSourceNodes( distFine, omegaF,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_PMP, vx1_PMP, vx2_PMP, vx3_PMP,
         kxyFromfcNEQ_PMP, kyzFromfcNEQ_PMP, kxzFromfcNEQ_PMP, kxxMyyFromfcNEQ_PMP, kxxMzzFromfcNEQ_PMP);
@@ -189,6 +197,8 @@ __global__ void scaleFC_compressible(
     k_0M0 = k_base_MM0;
     k_MM0 = neighborXfine[k_base_MM0];
 
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     calculateMomentsOnSourceNodes( distFine, omegaF,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_PMM, vx1_PMM, vx2_PMM, vx3_PMM,
         kxyFromfcNEQ_PMM, kyzFromfcNEQ_PMM, kxzFromfcNEQ_PMM, kxxMyyFromfcNEQ_PMM, kxxMzzFromfcNEQ_PMM);
@@ -216,6 +226,8 @@ __global__ void scaleFC_compressible(
     k_0MM = k_base_0MM;
     k_MMM = k_base_MMM;
 
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     calculateMomentsOnSourceNodes( distFine, omegaF,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_MPM, vx1_MPM, vx2_MPM, vx3_MPM,
         kxyFromfcNEQ_MPM, kyzFromfcNEQ_MPM, kxzFromfcNEQ_MPM, kxxMyyFromfcNEQ_MPM, kxxMzzFromfcNEQ_MPM);
@@ -232,6 +244,8 @@ __global__ void scaleFC_compressible(
     k_M0M = neighborZfine[k_M0M];
     k_0MM = neighborZfine[k_0MM];
     k_MMM = neighborZfine[k_MMM];
+
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
     
     calculateMomentsOnSourceNodes( distFine, omegaF,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_MPP, vx1_MPP, vx2_MPP, vx3_MPP,
@@ -250,6 +264,8 @@ __global__ void scaleFC_compressible(
     k_0MM = k_MMM;
     k_MMM = neighborXfine[k_MMM];
 
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     calculateMomentsOnSourceNodes( distFine, omegaF,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_PPP, vx1_PPP, vx2_PPP, vx3_PPP,
         kxyFromfcNEQ_PPP, kyzFromfcNEQ_PPP, kxzFromfcNEQ_PPP, kxxMyyFromfcNEQ_PPP, kxxMzzFromfcNEQ_PPP);
@@ -267,6 +283,8 @@ __global__ void scaleFC_compressible(
     k_0M0 = k_base_MM0;
     k_MM0 = neighborXfine[k_base_MM0];
     
+    if(hasTurbulentViscosity) omegaF = omegaFine/ (c1o1 + c3o1*omegaFine*turbulentViscosityFine[k_000]);
+
     calculateMomentsOnSourceNodes( distFine, omegaF,
         k_000, k_M00, k_0M0, k_00M, k_MM0, k_M0M, k_0MM, k_MMM, drho_PPM, vx1_PPM, vx2_PPM, vx3_PPM,
         kxyFromfcNEQ_PPM, kyzFromfcNEQ_PPM, kxzFromfcNEQ_PPM, kxxMyyFromfcNEQ_PPM, kxxMzzFromfcNEQ_PPM);
@@ -540,6 +558,18 @@ __global__ void scaleFC_compressible(
     // y = 0.;
     // z = 0.;
     ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+    // index of the destination node and its neighbors
+    k_000 = indicesCoarse000[nodeIndex];
+    k_M00 = neighborXcoarse [k_000];
+    k_0M0 = neighborYcoarse [k_000];
+    k_00M = neighborZcoarse [k_000];
+    k_MM0 = neighborYcoarse [k_M00];
+    k_M0M = neighborZcoarse [k_M00];
+    k_0MM = neighborZcoarse [k_0M0];
+    k_MMM = neighborZcoarse [k_MM0];
+    ////////////////////////////////////////////////////////////////////////////////////
+
+    if(hasTurbulentViscosity) omegaC = omegaCoarse / (c1o1 + c3o1*omegaCoarse*turbulentViscosityCoarse[k_000]);
 
     ////////////////////////////////////////////////////////////////////////////////
     //! - Set macroscopic values on destination node (zeroth and first order moments)
@@ -642,19 +672,6 @@ __global__ void scaleFC_compressible(
     backwardInverseChimeraWithK(m_210, m_211, m_212, vvz, vz_sq, c9o1,  c1o9);
     backwardInverseChimeraWithK(m_220, m_221, m_222, vvz, vz_sq, c36o1, c1o36);
 
-
-    ////////////////////////////////////////////////////////////////////////////////////
-    // index of the destination node and its neighbors
-    k_000 = indicesCoarse000[nodeIndex];
-    k_M00 = neighborXcoarse [k_000];
-    k_0M0 = neighborYcoarse [k_000];
-    k_00M = neighborZcoarse [k_000];
-    k_MM0 = neighborYcoarse [k_M00];
-    k_M0M = neighborZcoarse [k_M00];
-    k_0MM = neighborZcoarse [k_0M0];
-    k_MMM = neighborZcoarse [k_MM0];
-    ////////////////////////////////////////////////////////////////////////////////////
-
     ////////////////////////////////////////////////////////////////////////////////////
     //! - Write distributions: style of reading and writing the distributions from/to
     //! stored arrays dependent on timestep is based on the esoteric twist algorithm
@@ -690,3 +707,7 @@ __global__ void scaleFC_compressible(
     (distCoarse.f[DIR_MMM])[k_MMM] = f_MMM;
     ////////////////////////////////////////////////////////////////////////////////////
 }
+
+template __global__ void scaleFC_compressible<true>( real *distributionsCoarse, real *distributionsFine, unsigned int *neighborXcoarse, unsigned int *neighborYcoarse, unsigned int *neighborZcoarse, unsigned int *neighborXfine, unsigned int *neighborYfine, unsigned int *neighborZfine, unsigned long long numberOfLBnodesCoarse, unsigned long long numberOfLBnodesFine, bool isEvenTimestep, unsigned int *indicesCoarse000, unsigned int *indicesFineMMM, unsigned int numberOfInterfaceNodes, real omegaCoarse, real omegaFine, real* turbulentViscosityCoarse, real* turbulentViscosityFine, OffFC offsetFC);
+
+template __global__ void scaleFC_compressible<false>( real *distributionsCoarse, real *distributionsFine, unsigned int *neighborXcoarse, unsigned int *neighborYcoarse, unsigned int *neighborZcoarse, unsigned int *neighborXfine, unsigned int *neighborYfine, unsigned int *neighborZfine, unsigned long long numberOfLBnodesCoarse, unsigned long long numberOfLBnodesFine, bool isEvenTimestep, unsigned int *indicesCoarse000, unsigned int *indicesFineMMM, unsigned int numberOfInterfaceNodes, real omegaCoarse, real omegaFine, real* turbulentViscosityCoarse, real* turbulentViscosityFine, OffFC offsetFC);
\ No newline at end of file
diff --git a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
index 4faea21102b6a68dd9a0aa30e9cecc7eba6051b0..9abac27969e74dc90ecdcc707f4fcb2234010d07 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/LBMKernel.cu
@@ -4054,12 +4054,12 @@ void ScaleCF_RhoSq_comp_27(LBMSimulationParameter * parameterDeviceC, LBMSimulat
     getLastCudaError("scaleCF_RhoSq_27 execution failed");
 }
 
-void ScaleCF_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF& offsetCF, CUstream_st *stream)
+template<bool hasTurbulentViscosity> void ScaleCF_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF& offsetCF, CUstream_st *stream)
 {
     dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellCF->kCF);
     dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );
 
-    scaleCF_compressible<<<grid, threads, 0, stream>>>(
+    scaleCF_compressible<hasTurbulentViscosity><<<grid, threads, 0, stream>>>(
         parameterDeviceC->distributions.f[0],
         parameterDeviceF->distributions.f[0],
         parameterDeviceC->neighborX,
@@ -4076,9 +4076,14 @@ void ScaleCF_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulati
         icellCF->kCF,
         parameterDeviceC->omega,
         parameterDeviceF->omega,
+        parameterDeviceC->turbViscosity,
+        parameterDeviceF->turbViscosity,
         offsetCF);
+
     getLastCudaError("scaleCF_compressible execution failed");
 }
+template void ScaleCF_compressible<true>(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF& offsetCF, CUstream_st *stream);
+template void ScaleCF_compressible<false>(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellCF * icellCF, OffCF& offsetCF, CUstream_st *stream);
 
 //////////////////////////////////////////////////////////////////////////
 void ScaleCF_RhoSq_3rdMom_comp_27(
@@ -4946,12 +4951,12 @@ void ScaleFC_RhoSq_comp_27(LBMSimulationParameter * parameterDeviceC, LBMSimulat
     getLastCudaError("scaleFC_RhoSq_comp_27 execution failed");
 }
 //////////////////////////////////////////////////////////////////////////
-void ScaleFC_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC &offsetFC, CUstream_st *stream)
+template<bool hasTurbulentViscosity> void ScaleFC_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC &offsetFC, CUstream_st *stream)
 {
     dim3 grid = vf::cuda::getCudaGrid(parameterDeviceC->numberofthreads,  icellFC->kFC);
     dim3 threads(parameterDeviceC->numberofthreads, 1, 1 );
 
-    scaleFC_compressible<<<grid, threads, 0, stream>>>(
+    scaleFC_compressible<hasTurbulentViscosity><<<grid, threads, 0, stream>>>(
         parameterDeviceC->distributions.f[0],
         parameterDeviceF->distributions.f[0],
         parameterDeviceC->neighborX,
@@ -4968,9 +4973,15 @@ void ScaleFC_compressible(LBMSimulationParameter * parameterDeviceC, LBMSimulati
         icellFC->kFC,
         parameterDeviceC->omega,
         parameterDeviceF->omega,
+        parameterDeviceC->turbViscosity,
+        parameterDeviceF->turbViscosity,
         offsetFC);
+
     getLastCudaError("scaleFC_compressible execution failed");
 }
+template void ScaleFC_compressible<true>(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC &offsetFC, CUstream_st *stream);
+template void ScaleFC_compressible<false>(LBMSimulationParameter * parameterDeviceC, LBMSimulationParameter* parameterDeviceF, ICellFC * icellFC, OffFC &offsetFC, CUstream_st *stream);
+
 //////////////////////////////////////////////////////////////////////////
 void ScaleFC_RhoSq_3rdMom_comp_27(
     real* DC,
diff --git a/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
index cc8ca53d15ac02686b850a70ab181bb47285a7d1..80e3c273987092aa63e4a2724df0df3df7152145 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/SlipBCs27.cu
@@ -32,6 +32,7 @@
 //======================================================================================
 #include "LBM/LB.h" 
 #include "lbm/constants/D3Q27.h"
+#include "Kernel/Utilities/DistributionHelper.cuh"
 #include "lbm/constants/NumericConstants.h"
 #include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
@@ -52,67 +53,8 @@ __global__ void QSlipDevice27(
     unsigned long long numberOfLBnodes, 
     bool isEvenTimestep)
 {
-   Distributions27 D;
-   if (isEvenTimestep==true)
-   {
-      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
-      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
-      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
-      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
-      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
-      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
-      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
-      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
-      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
-      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
-      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
-      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
-      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
-      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
-      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
-      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
-      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
-      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
-      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
-      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
-      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
-      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
-      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
-      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
-      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
-   } 
-   else
-   {
-      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
-      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
-      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
-      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
-      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
-      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
-      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
-      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
-      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
-      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
-      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
-      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
-      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
-      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
-      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
-      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
-      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
-      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
-      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
-      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
-      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
-      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
-      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
-      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
-      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
-   }
+   Distributions27 D = vf::gpu::getDistributionReferences27(DD, numberOfLBnodes, isEvenTimestep);
+
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index 
    const unsigned  y = blockIdx.x;   // Globaler y-Index 
@@ -237,66 +179,8 @@ __global__ void QSlipDevice27(
       real cu_sq=c3o2*(vx1*vx1+vx2*vx2+vx3*vx3);
 
       //////////////////////////////////////////////////////////////////////////
-      if (isEvenTimestep==false)
-      {
-         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
-         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
-         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
-         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
-         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
-         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
-         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
-         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
-         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
-         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
-         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
-         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
-         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
-         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
-         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
-         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
-         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
-         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
-         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
-         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
-         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
-         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
-         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
-         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
-         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
-      } 
-      else
-      {
-         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
-         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
-         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
-         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
-         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
-         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
-         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
-         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
-         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
-         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
-         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
-         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
-         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
-         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
-         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
-         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
-         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
-         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
-         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
-         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
-         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
-         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
-         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
-         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
-         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
-      }
+
+      D = vf::gpu::getDistributionReferences27(DD, numberOfLBnodes, !isEvenTimestep);
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Test
       //(D.f[DIR_000])[k]=c1o10;
@@ -1248,9 +1132,7 @@ __global__ void BBSlipDeviceComp27(
       //! - Read distributions: style of reading and writing the distributions from/to stored arrays dependent on timestep is based on the esoteric twist algorithm \ref
       //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017), DOI:10.3390/computation5020019 ]</b></a>
       //!
-      Distributions27 dist;
-      getPointersToDistributions(dist, distributions, numberOfLBnodes, isEvenTimestep);
-
+      Distributions27 dist = vf::gpu::getDistributionReferences27(distributions, numberOfLBnodes, isEvenTimestep);
       ////////////////////////////////////////////////////////////////////////////////
       //! - Set local subgrid distances (q's)
       //!
@@ -1338,13 +1220,13 @@ __global__ void BBSlipDeviceComp27(
                    (-(f_BN - f_TS)  + (f_TN - f_BS))   + ((f_TE - f_BW)   - (f_BE - f_TW)) +
                    (f_T - f_B)) / (c1o1 + drho);
 
-      real cu_sq = c3o2 * (vx1 * vx1 + vx2 * vx2 + vx3 * vx3) * (c1o1 + drho);
+      // real cu_sq = c3o2 * (vx1 * vx1 + vx2 * vx2 + vx3 * vx3) * (c1o1 + drho);
 
       ////////////////////////////////////////////////////////////////////////////////
       //! - change the pointer to write the results in the correct array
       //!
-      getPointersToDistributions(dist, distributions, numberOfLBnodes, !isEvenTimestep);
 
+      dist = vf::gpu::getDistributionReferences27(distributions, numberOfLBnodes, !isEvenTimestep);
       ////////////////////////////////////////////////////////////////////////////////
       //! - Multiply the local velocities by the slipLength
       //!
diff --git a/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu b/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu
index 3208299e93940dabe52faa7d0b3c684c45596660..0838402693e469efb10be2f9cd59094107383b66 100644
--- a/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu
+++ b/src/gpu/VirtualFluids_GPU/GPU/StressBCs27.cu
@@ -42,6 +42,7 @@
 
 #include "LBM/LB.h"
 #include "lbm/constants/D3Q27.h"
+#include "Kernel/Utilities/DistributionHelper.cuh"
 #include <lbm/constants/NumericConstants.h>
 #include "LBM/GPUHelperFunctions/KernelUtilities.h"
 
@@ -107,7 +108,7 @@ __host__ __device__ __forceinline__ void iMEM(
       real _vz_w = vz_w_inst-vDotN_w*wallNormalZ;
 
       //Compute wall shear stress tau_w via MOST
-      real z = (real)samplingOffset[k] + 0.5; //assuming q=0.5, could be replaced by wall distance via wall normal
+      real z = (real)samplingOffset[k] + q; //assuming q=0.5, could be replaced by wall distance via wall normal
       real kappa = 0.4;
       real u_star = vMag_el*kappa/(log(z/z0[k]));
       if(hasWallModelMonitor) u_star_monitor[k] = u_star;
@@ -137,6 +138,7 @@ __host__ __device__ __forceinline__ void iMEM(
       wallVelocityZ = clipVz > -clipVz? min(clipVz, max(-clipVz, -3.0*F_z*forceFactor)): max(clipVz, min(-clipVz, -3.0*F_z*forceFactor));
 }
 
+
 //////////////////////////////////////////////////////////////////////////////
 __global__ void QStressDeviceComp27(
     real* DD,
@@ -172,67 +174,8 @@ __global__ void QStressDeviceComp27(
     bool isEvenTimestep)
 {
 
-   Distributions27 D;
-   if (isEvenTimestep==true)//get right array of post coll f's
-   {
-      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
-      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
-      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
-      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
-      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
-      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
-      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
-      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
-      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
-      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
-      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
-      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
-      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
-      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
-      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
-      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
-      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
-      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
-      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
-      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
-      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
-      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
-      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
-      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
-      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
-   }
-   else
-   {
-      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
-      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
-      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
-      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
-      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
-      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
-      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
-      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
-      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
-      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
-      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
-      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
-      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
-      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
-      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
-      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
-      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
-      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
-      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
-      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
-      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
-      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
-      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
-      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
-      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
-   }
+   Distributions27 D = vf::gpu::getDistributionReferences27(DD, numberOfLBnodes, isEvenTimestep);
+
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index
    const unsigned  y = blockIdx.x;   // Globaler y-Index
@@ -362,66 +305,8 @@ __global__ void QStressDeviceComp27(
 
       real om_turb = om1 / (c1o1 + c3o1*om1*max(c0o1, turbViscosity[k_Q[k]]));
       //////////////////////////////////////////////////////////////////////////
-      if (isEvenTimestep==false)      //get adress where incoming f's should be written to
-      {
-         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
-         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
-         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
-         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
-         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
-         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
-         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
-         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
-         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
-         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
-         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
-         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
-         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
-         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
-         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
-         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
-         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
-         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
-         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
-         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
-         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
-         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
-         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
-         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
-         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
-      }
-      else
-      {
-         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
-         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
-         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
-         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
-         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
-         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
-         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
-         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
-         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
-         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
-         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
-         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
-         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
-         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
-         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
-         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
-         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
-         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
-         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
-         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
-         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
-         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
-         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
-         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
-         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
-      }
+
+      D = vf::gpu::getDistributionReferences27(DD, numberOfLBnodes, !isEvenTimestep);
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       //Compute incoming f's with zero wall velocity
       ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -437,7 +322,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
-         f_W_in = getInterpolatedDistributionForNoSlipBC(q, f_E, f_W, feq, om_turb);
+         // f_W_in = getInterpolatedDistributionForNoSlipBC(q, f_E, f_W, feq, om_turb);
+         f_W_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_E, f_W, feq, om_turb, drho, c2o27);
          wallMomentumX += f_E+f_W_in;
       }
 
@@ -446,7 +332,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
-         f_E_in = getInterpolatedDistributionForNoSlipBC(q, f_W, f_E, feq, om_turb);
+         // f_E_in = getInterpolatedDistributionForNoSlipBC(q, f_W, f_E, feq, om_turb);
+         f_E_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_W, f_E, feq, om_turb, drho, c2o27);
          wallMomentumX -= f_W+f_E_in;
       }
 
@@ -455,7 +342,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx2;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
-         f_S_in = getInterpolatedDistributionForNoSlipBC(q, f_N, f_S, feq, om_turb);
+         // f_S_in = getInterpolatedDistributionForNoSlipBC(q, f_N, f_S, feq, om_turb);
+         f_S_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_N, f_S, feq, om_turb, drho, c2o27);
          wallMomentumY += f_N+f_S_in;
       }
 
@@ -464,7 +352,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx2;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
-         f_N_in = getInterpolatedDistributionForNoSlipBC(q, f_S, f_N, feq, om_turb);
+         // f_N_in = getInterpolatedDistributionForNoSlipBC(q, f_S, f_N, feq, om_turb);
+         f_N_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_S, f_N, feq, om_turb, drho, c2o27);
          wallMomentumY -= f_S+f_N_in;
       }
 
@@ -473,7 +362,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
-         f_B_in = getInterpolatedDistributionForNoSlipBC(q, f_T, f_B, feq, om_turb);
+         // f_B_in = getInterpolatedDistributionForNoSlipBC(q, f_T, f_B, feq, om_turb);
+         f_B_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_T, f_B, feq, om_turb, drho, c2o27);
          wallMomentumZ += f_T+f_B_in;
       }
 
@@ -482,7 +372,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c2o27);
-         f_T_in = getInterpolatedDistributionForNoSlipBC(q, f_B, f_T, feq, om_turb);
+         // f_T_in = getInterpolatedDistributionForNoSlipBC(q, f_B, f_T, feq, om_turb);
+         f_T_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_B, f_T, feq, om_turb, drho, c2o27);
          wallMomentumZ -= f_B+f_T_in;
       }
 
@@ -491,7 +382,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1 + vx2;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_SW_in = getInterpolatedDistributionForNoSlipBC(q, f_NE, f_SW, feq, om_turb);
+         // f_SW_in = getInterpolatedDistributionForNoSlipBC(q, f_NE, f_SW, feq, om_turb);
+         f_SW_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_NE, f_SW, feq, om_turb, drho, c2o27);
          wallMomentumX += f_NE+f_SW_in;
          wallMomentumY += f_NE+f_SW_in;
       }
@@ -501,7 +393,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1 - vx2;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_NE_in = getInterpolatedDistributionForNoSlipBC(q, f_SW, f_NE, feq, om_turb);
+         // f_NE_in = getInterpolatedDistributionForNoSlipBC(q, f_SW, f_NE, feq, om_turb);
+         f_NE_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_SW, f_NE, feq, om_turb, drho, c1o54);
          wallMomentumX -= f_SW+f_NE_in;
          wallMomentumY -= f_SW+f_NE_in;
       }
@@ -511,7 +404,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1 - vx2;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_NW_in = getInterpolatedDistributionForNoSlipBC(q, f_SE, f_NW, feq, om_turb);
+         // f_NW_in = getInterpolatedDistributionForNoSlipBC(q, f_SE, f_NW, feq, om_turb);
+         f_NW_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_SE, f_NW, feq, om_turb, drho, c1o54);
          wallMomentumX += f_SE+f_NW_in;
          wallMomentumY -= f_SE+f_NW_in;
       }
@@ -521,7 +415,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1 + vx2;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_SE_in = getInterpolatedDistributionForNoSlipBC(q, f_NW, f_SE, feq, om_turb);
+         // f_SE_in = getInterpolatedDistributionForNoSlipBC(q, f_NW, f_SE, feq, om_turb);
+         f_SE_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_NW, f_SE, feq, om_turb, drho, c1o54);
          wallMomentumX -= f_NW+f_SE_in;
          wallMomentumY += f_NW+f_SE_in;
       }
@@ -531,7 +426,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1 + vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_BW_in = getInterpolatedDistributionForNoSlipBC(q, f_TE, f_BW, feq, om_turb);
+         // f_BW_in = getInterpolatedDistributionForNoSlipBC(q, f_TE, f_BW, feq, om_turb);
+         f_BW_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_TE, f_BW, feq, om_turb, drho, c1o54);
          wallMomentumX += f_TE+f_BW_in;
          wallMomentumZ += f_TE+f_BW_in;
       }
@@ -541,7 +437,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1 - vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_TE_in = getInterpolatedDistributionForNoSlipBC(q, f_BW, f_TE, feq, om_turb);
+         // f_TE_in = getInterpolatedDistributionForNoSlipBC(q, f_BW, f_TE, feq, om_turb);
+         f_TE_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_BW, f_TE, feq, om_turb, drho, c1o54);
          wallMomentumX -= f_BW+f_TE_in;
          wallMomentumZ -= f_BW+f_TE_in;
       }
@@ -551,7 +448,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1 - vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_TW_in = getInterpolatedDistributionForNoSlipBC(q, f_BE, f_TW, feq, om_turb);
+         // f_TW_in = getInterpolatedDistributionForNoSlipBC(q, f_BE, f_TW, feq, om_turb);
+         f_TW_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_BE, f_TW, feq, om_turb, drho, c1o54);
          wallMomentumX += f_BE+f_TW_in;
          wallMomentumZ -= f_BE+f_TW_in;
       }
@@ -561,7 +459,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1 + vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_BE_in = getInterpolatedDistributionForNoSlipBC(q, f_TW, f_BE, feq, om_turb);
+         // f_BE_in = getInterpolatedDistributionForNoSlipBC(q, f_TW, f_BE, feq, om_turb);
+         f_BE_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_TW, f_BE, feq, om_turb, drho, c1o54);
          wallMomentumX -= f_TW+f_BE_in;
          wallMomentumZ += f_TW+f_BE_in;
       }
@@ -571,7 +470,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx2 + vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_BS_in = getInterpolatedDistributionForNoSlipBC(q, f_TN, f_BS, feq, om_turb);
+         // f_BS_in = getInterpolatedDistributionForNoSlipBC(q, f_TN, f_BS, feq, om_turb);
+         f_BS_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_TN, f_BS, feq, om_turb, drho, c1o54);
          wallMomentumY += f_TN+f_BS_in;
          wallMomentumZ += f_TN+f_BS_in;
       }
@@ -581,7 +481,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx2 - vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_TN_in = getInterpolatedDistributionForNoSlipBC(q, f_BS, f_TN, feq, om_turb);
+         // f_TN_in = getInterpolatedDistributionForNoSlipBC(q, f_BS, f_TN, feq, om_turb);
+         f_TN_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_BS, f_TN, feq, om_turb, drho, c1o54);
          wallMomentumY -= f_BS+f_TN_in;
          wallMomentumZ -= f_BS+f_TN_in;
       }
@@ -591,7 +492,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx2 - vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_TS_in = getInterpolatedDistributionForNoSlipBC(q, f_BN, f_TS, feq, om_turb);
+         // f_TS_in = getInterpolatedDistributionForNoSlipBC(q, f_BN, f_TS, feq, om_turb);
+         f_TS_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_BN, f_TS, feq, om_turb, drho, c1o54);
          wallMomentumY += f_BN+f_TS_in;
          wallMomentumZ -= f_BN+f_TS_in;
       }
@@ -601,7 +503,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx2 + vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o54);
-         f_BN_in = getInterpolatedDistributionForNoSlipBC(q, f_TS, f_BN, feq, om_turb);
+         // f_BN_in = getInterpolatedDistributionForNoSlipBC(q, f_TS, f_BN, feq, om_turb);
+         f_BN_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_TS, f_BN, feq, om_turb, drho, c1o54);
          wallMomentumY -= f_TS+f_BN_in;
          wallMomentumZ += f_TS+f_BN_in;
       }
@@ -611,7 +514,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1 + vx2 + vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
-         f_BSW_in = getInterpolatedDistributionForNoSlipBC(q, f_TNE, f_BSW, feq, om_turb);
+         // f_BSW_in = getInterpolatedDistributionForNoSlipBC(q, f_TNE, f_BSW, feq, om_turb);
+         f_BSW_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_TNE, f_BSW, feq, om_turb, drho, c1o216);
          wallMomentumX += f_TNE+f_BSW_in;
          wallMomentumY += f_TNE+f_BSW_in;
          wallMomentumZ += f_TNE+f_BSW_in;
@@ -622,7 +526,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1 - vx2 - vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
-         f_TNE_in = getInterpolatedDistributionForNoSlipBC(q, f_BSW, f_TNE, feq, om_turb);
+         // f_TNE_in = getInterpolatedDistributionForNoSlipBC(q, f_BSW, f_TNE, feq, om_turb);
+         f_TNE_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_BSW, f_TNE, feq, om_turb, drho, c1o216);
          wallMomentumX -= f_BSW+f_TNE_in;
          wallMomentumY -= f_BSW+f_TNE_in;
          wallMomentumZ -= f_BSW+f_TNE_in;
@@ -633,7 +538,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1 + vx2 - vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
-         f_TSW_in = getInterpolatedDistributionForNoSlipBC(q, f_BNE, f_TSW, feq, om_turb);
+         // f_TSW_in = getInterpolatedDistributionForNoSlipBC(q, f_BNE, f_TSW, feq, om_turb);
+         f_TSW_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_BNE, f_TSW, feq, om_turb, drho, c1o216);
          wallMomentumX += f_BNE+f_TSW_in;
          wallMomentumY += f_BNE+f_TSW_in;
          wallMomentumZ -= f_BNE+f_TSW_in;
@@ -644,7 +550,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1 - vx2 + vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
-         f_BNE_in = getInterpolatedDistributionForNoSlipBC(q, f_TSW, f_BNE, feq, om_turb);
+         // f_BNE_in = getInterpolatedDistributionForNoSlipBC(q, f_TSW, f_BNE, feq, om_turb);
+         f_BNE_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_TSW, f_BNE, feq, om_turb, drho, c1o216);
          wallMomentumX -= f_TSW+f_BNE_in;
          wallMomentumY -= f_TSW+f_BNE_in;
          wallMomentumZ += f_TSW+f_BNE_in;
@@ -655,7 +562,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1 - vx2 + vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
-         f_BNW_in = getInterpolatedDistributionForNoSlipBC(q, f_TSE, f_BNW, feq, om_turb);
+         // f_BNW_in = getInterpolatedDistributionForNoSlipBC(q, f_TSE, f_BNW, feq, om_turb);
+         f_BNW_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_TSE, f_BNW, feq, om_turb, drho, c1o216);
          wallMomentumX += f_TSE+f_BNW_in;
          wallMomentumY -= f_TSE+f_BNW_in;
          wallMomentumZ += f_TSE+f_BNW_in;
@@ -666,7 +574,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1 + vx2 - vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
-         f_TSE_in = getInterpolatedDistributionForNoSlipBC(q, f_BNW, f_TSE, feq, om_turb);
+         // f_TSE_in = getInterpolatedDistributionForNoSlipBC(q, f_BNW, f_TSE, feq, om_turb);
+         f_TSE_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_BNW, f_TSE, feq, om_turb, drho, c1o216);
          wallMomentumX -= f_BNW+f_TSE_in;
          wallMomentumY += f_BNW+f_TSE_in;
          wallMomentumZ -= f_BNW+f_TSE_in;
@@ -677,7 +586,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = vx1 - vx2 - vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
-         f_TNW_in = getInterpolatedDistributionForNoSlipBC(q, f_BSE, f_TNW, feq, om_turb);
+         // f_TNW_in = getInterpolatedDistributionForNoSlipBC(q, f_BSE, f_TNW, feq, om_turb);
+         f_TNW_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_BSE, f_TNW, feq, om_turb, drho, c1o216);
          wallMomentumX += f_BSE+f_TNW_in;
          wallMomentumY -= f_BSE+f_TNW_in;
          wallMomentumZ -= f_BSE+f_TNW_in;
@@ -688,7 +598,8 @@ __global__ void QStressDeviceComp27(
       {
          velocityLB = -vx1 + vx2 + vx3;
          feq = getEquilibriumForBC(drho, velocityLB, cu_sq, c1o216);
-         f_BSE_in = getInterpolatedDistributionForNoSlipBC(q, f_TNW, f_BSE, feq, om_turb);
+         // f_BSE_in = getInterpolatedDistributionForNoSlipBC(q, f_TNW, f_BSE, feq, om_turb);
+         f_BSE_in = getInterpolatedDistributionForNoSlipWithPressureBC(q, f_TNW, f_BSE, feq, om_turb, drho, c1o216);
          wallMomentumX -= f_TNW+f_BSE_in;
          wallMomentumY += f_TNW+f_BSE_in;
          wallMomentumZ += f_TNW+f_BSE_in;
@@ -699,7 +610,7 @@ __global__ void QStressDeviceComp27(
       // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real VeloX=0.0, VeloY=0.0, VeloZ=0.0;
 
-      q = 0.5f;
+      q = q_dirB[k];
       real eps = 0.001f;
 
       iMEM( k, k_N[k],
@@ -974,67 +885,9 @@ __global__ void BBStressDevice27( real* DD,
                                              unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep)
 {
-   Distributions27 D;
-   if (isEvenTimestep==true)
-   {
-      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
-      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
-      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
-      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
-      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
-      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
-      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
-      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
-      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
-      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
-      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
-      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
-      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
-      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
-      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
-      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
-      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
-      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
-      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
-      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
-      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
-      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
-      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
-      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
-      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
-   }
-   else
-   {
-      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
-      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
-      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
-      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
-      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
-      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
-      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
-      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
-      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
-      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
-      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
-      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
-      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
-      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
-      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
-      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
-      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
-      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
-      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
-      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
-      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
-      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
-      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
-      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
-      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
-   }
+
+   Distributions27 D = vf::gpu::getDistributionReferences27(DD, numberOfLBnodes, isEvenTimestep);
+
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index
    const unsigned  y = blockIdx.x;   // Globaler y-Index
@@ -1162,66 +1015,8 @@ __global__ void BBStressDevice27( real* DD,
                  (f_T - f_B)) / (c1o1 + drho);
 
       //////////////////////////////////////////////////////////////////////////
-      if (isEvenTimestep==false)
-      {
-         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
-         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
-         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
-         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
-         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
-         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
-         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
-         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
-         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
-         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
-         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
-         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
-         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
-         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
-         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
-         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
-         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
-         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
-         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
-         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
-         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
-         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
-         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
-         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
-         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
-      }
-      else
-      {
-         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
-         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
-         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
-         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
-         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
-         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
-         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
-         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
-         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
-         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
-         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
-         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
-         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
-         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
-         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
-         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
-         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
-         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
-         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
-         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
-         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
-         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
-         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
-         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
-         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
-      }
+
+      D = vf::gpu::getDistributionReferences27(DD, numberOfLBnodes, !isEvenTimestep);
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real f_E_in,  f_W_in,  f_N_in,  f_S_in,  f_T_in,  f_B_in,   f_NE_in,  f_SW_in,  f_SE_in,  f_NW_in,  f_TE_in,  f_BW_in,  f_BE_in,
          f_TW_in, f_TN_in, f_BS_in, f_BN_in, f_TS_in, f_TNE_in, f_TSW_in, f_TSE_in, f_TNW_in, f_BNE_in, f_BSW_in, f_BSE_in, f_BNW_in;
@@ -1445,7 +1240,7 @@ __global__ void BBStressDevice27( real* DD,
       // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real VeloX=0.0, VeloY=0.0, VeloZ=0.0;
 
-      q = 0.5f;
+      q = q_dirB[k];
       real eps = 0.001f;
 
       iMEM( k, k_N[k],
@@ -1721,67 +1516,8 @@ __global__ void BBStressPressureDevice27( real* DD,
                                              unsigned long long numberOfLBnodes,
                                              bool isEvenTimestep)
 {
-   Distributions27 D;
-   if (isEvenTimestep==true)
-   {
-      D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
-      D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
-      D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
-      D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
-      D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
-      D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
-      D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
-      D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
-      D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
-      D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
-      D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
-      D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
-      D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
-      D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
-      D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
-      D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
-      D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
-      D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-      D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
-      D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
-      D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
-      D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
-      D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
-      D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
-      D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
-      D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
-   }
-   else
-   {
-      D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
-      D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
-      D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
-      D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
-      D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
-      D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
-      D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
-      D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
-      D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
-      D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
-      D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
-      D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
-      D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
-      D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
-      D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
-      D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
-      D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
-      D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
-      D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-      D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
-      D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
-      D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
-      D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
-      D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
-      D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
-      D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
-      D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
-   }
+   Distributions27 D = vf::gpu::getDistributionReferences27(DD, numberOfLBnodes, isEvenTimestep);
+
    ////////////////////////////////////////////////////////////////////////////////
    const unsigned  x = threadIdx.x;  // Globaler x-Index
    const unsigned  y = blockIdx.x;   // Globaler y-Index
@@ -1909,66 +1645,8 @@ __global__ void BBStressPressureDevice27( real* DD,
                  (f_T - f_B)) / (c1o1 + drho);
 
       //////////////////////////////////////////////////////////////////////////
-      if (isEvenTimestep==false)
-      {
-         D.f[DIR_P00] = &DD[DIR_P00 * numberOfLBnodes];
-         D.f[DIR_M00] = &DD[DIR_M00 * numberOfLBnodes];
-         D.f[DIR_0P0] = &DD[DIR_0P0 * numberOfLBnodes];
-         D.f[DIR_0M0] = &DD[DIR_0M0 * numberOfLBnodes];
-         D.f[DIR_00P] = &DD[DIR_00P * numberOfLBnodes];
-         D.f[DIR_00M] = &DD[DIR_00M * numberOfLBnodes];
-         D.f[DIR_PP0] = &DD[DIR_PP0 * numberOfLBnodes];
-         D.f[DIR_MM0] = &DD[DIR_MM0 * numberOfLBnodes];
-         D.f[DIR_PM0] = &DD[DIR_PM0 * numberOfLBnodes];
-         D.f[DIR_MP0] = &DD[DIR_MP0 * numberOfLBnodes];
-         D.f[DIR_P0P] = &DD[DIR_P0P * numberOfLBnodes];
-         D.f[DIR_M0M] = &DD[DIR_M0M * numberOfLBnodes];
-         D.f[DIR_P0M] = &DD[DIR_P0M * numberOfLBnodes];
-         D.f[DIR_M0P] = &DD[DIR_M0P * numberOfLBnodes];
-         D.f[DIR_0PP] = &DD[DIR_0PP * numberOfLBnodes];
-         D.f[DIR_0MM] = &DD[DIR_0MM * numberOfLBnodes];
-         D.f[DIR_0PM] = &DD[DIR_0PM * numberOfLBnodes];
-         D.f[DIR_0MP] = &DD[DIR_0MP * numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-         D.f[DIR_PPP] = &DD[DIR_PPP * numberOfLBnodes];
-         D.f[DIR_MMP] = &DD[DIR_MMP * numberOfLBnodes];
-         D.f[DIR_PMP] = &DD[DIR_PMP * numberOfLBnodes];
-         D.f[DIR_MPP] = &DD[DIR_MPP * numberOfLBnodes];
-         D.f[DIR_PPM] = &DD[DIR_PPM * numberOfLBnodes];
-         D.f[DIR_MMM] = &DD[DIR_MMM * numberOfLBnodes];
-         D.f[DIR_PMM] = &DD[DIR_PMM * numberOfLBnodes];
-         D.f[DIR_MPM] = &DD[DIR_MPM * numberOfLBnodes];
-      }
-      else
-      {
-         D.f[DIR_M00] = &DD[DIR_P00 * numberOfLBnodes];
-         D.f[DIR_P00] = &DD[DIR_M00 * numberOfLBnodes];
-         D.f[DIR_0M0] = &DD[DIR_0P0 * numberOfLBnodes];
-         D.f[DIR_0P0] = &DD[DIR_0M0 * numberOfLBnodes];
-         D.f[DIR_00M] = &DD[DIR_00P * numberOfLBnodes];
-         D.f[DIR_00P] = &DD[DIR_00M * numberOfLBnodes];
-         D.f[DIR_MM0] = &DD[DIR_PP0 * numberOfLBnodes];
-         D.f[DIR_PP0] = &DD[DIR_MM0 * numberOfLBnodes];
-         D.f[DIR_MP0] = &DD[DIR_PM0 * numberOfLBnodes];
-         D.f[DIR_PM0] = &DD[DIR_MP0 * numberOfLBnodes];
-         D.f[DIR_M0M] = &DD[DIR_P0P * numberOfLBnodes];
-         D.f[DIR_P0P] = &DD[DIR_M0M * numberOfLBnodes];
-         D.f[DIR_M0P] = &DD[DIR_P0M * numberOfLBnodes];
-         D.f[DIR_P0M] = &DD[DIR_M0P * numberOfLBnodes];
-         D.f[DIR_0MM] = &DD[DIR_0PP * numberOfLBnodes];
-         D.f[DIR_0PP] = &DD[DIR_0MM * numberOfLBnodes];
-         D.f[DIR_0MP] = &DD[DIR_0PM * numberOfLBnodes];
-         D.f[DIR_0PM] = &DD[DIR_0MP * numberOfLBnodes];
-         D.f[DIR_000] = &DD[DIR_000 * numberOfLBnodes];
-         D.f[DIR_PPP] = &DD[DIR_MMM * numberOfLBnodes];
-         D.f[DIR_MMP] = &DD[DIR_PPM * numberOfLBnodes];
-         D.f[DIR_PMP] = &DD[DIR_MPM * numberOfLBnodes];
-         D.f[DIR_MPP] = &DD[DIR_PMM * numberOfLBnodes];
-         D.f[DIR_PPM] = &DD[DIR_MMP * numberOfLBnodes];
-         D.f[DIR_MMM] = &DD[DIR_PPP * numberOfLBnodes];
-         D.f[DIR_PMM] = &DD[DIR_MPP * numberOfLBnodes];
-         D.f[DIR_MPM] = &DD[DIR_PMP * numberOfLBnodes];
-      }
+      D = vf::gpu::getDistributionReferences27(DD, numberOfLBnodes, !isEvenTimestep);
+
       ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real f_E_in,  f_W_in,  f_N_in,  f_S_in,  f_T_in,  f_B_in,   f_NE_in,  f_SW_in,  f_SE_in,  f_NW_in,  f_TE_in,  f_BW_in,  f_BE_in,
          f_TW_in, f_TN_in, f_BS_in, f_BN_in, f_TS_in, f_TNE_in, f_TSW_in, f_TSE_in, f_TNW_in, f_BNE_in, f_BSW_in, f_BSE_in, f_BNW_in;
@@ -2192,7 +1870,7 @@ __global__ void BBStressPressureDevice27( real* DD,
       // ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
       real VeloX=0.0, VeloY=0.0, VeloZ=0.0;
 
-      q = 0.5f;
+      q = q_dirB[k];
       real eps = 0.001f;
 
       iMEM( k, k_N[k],
diff --git a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cu b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cu
index 1ffec96c255b7923f3ee39c01f756abd8cad8862..2044c6ad8d7f242c96479cd060c70b91c1dfb216 100644
--- a/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cu
+++ b/src/gpu/VirtualFluids_GPU/Kernel/Kernels/BasicKernels/FluidFlow/Compressible/CumulantK17/CumulantK17_Device.cu
@@ -607,7 +607,7 @@ __global__ void LB_Kernel_CumulantK17(
     m_001 = -m_001;
 
     //Write to array here to distribute read/write
-    if(writeMacroscopicVariables)
+    if(writeMacroscopicVariables || turbulenceModel==TurbulenceModel::AMD)
     {
         rho[k_000] = drho;
         vx[k_000] = vvx;
@@ -664,33 +664,33 @@ __global__ void LB_Kernel_CumulantK17(
     //! <a href="https://doi.org/10.3390/computation5020019"><b>[ M. Geier et al. (2017),
     //! DOI:10.3390/computation5020019 ]</b></a>
     //!
-    (dist.f[DIR_P00])[k_000]    = f_M00;
-    (dist.f[DIR_M00])[k_M00]    = f_P00;
-    (dist.f[DIR_0P0])[k_000]    = f_0M0;
-    (dist.f[DIR_0M0])[k_0M0]    = f_0P0;
-    (dist.f[DIR_00P])[k_000]    = f_00M;
-    (dist.f[DIR_00M])[k_00M]    = f_00P;
-    (dist.f[DIR_PP0])[k_000]   = f_MM0;
-    (dist.f[DIR_MM0])[k_MM0]   = f_PP0;
-    (dist.f[DIR_PM0])[k_0M0]   = f_MP0;
-    (dist.f[DIR_MP0])[k_M00]   = f_PM0;
-    (dist.f[DIR_P0P])[k_000]   = f_M0M;
-    (dist.f[DIR_M0M])[k_M0M]   = f_P0P;
-    (dist.f[DIR_P0M])[k_00M]   = f_M0P;
-    (dist.f[DIR_M0P])[k_M00]   = f_P0M;
-    (dist.f[DIR_0PP])[k_000]   = f_0MM;
-    (dist.f[DIR_0MM])[k_0MM]   = f_0PP;
-    (dist.f[DIR_0PM])[k_00M]   = f_0MP;
-    (dist.f[DIR_0MP])[k_0M0]   = f_0PM;
+    (dist.f[DIR_P00])[k_000] = f_M00;
+    (dist.f[DIR_M00])[k_M00] = f_P00;
+    (dist.f[DIR_0P0])[k_000] = f_0M0;
+    (dist.f[DIR_0M0])[k_0M0] = f_0P0;
+    (dist.f[DIR_00P])[k_000] = f_00M;
+    (dist.f[DIR_00M])[k_00M] = f_00P;
+    (dist.f[DIR_PP0])[k_000] = f_MM0;
+    (dist.f[DIR_MM0])[k_MM0] = f_PP0;
+    (dist.f[DIR_PM0])[k_0M0] = f_MP0;
+    (dist.f[DIR_MP0])[k_M00] = f_PM0;
+    (dist.f[DIR_P0P])[k_000] = f_M0M;
+    (dist.f[DIR_M0M])[k_M0M] = f_P0P;
+    (dist.f[DIR_P0M])[k_00M] = f_M0P;
+    (dist.f[DIR_M0P])[k_M00] = f_P0M;
+    (dist.f[DIR_0PP])[k_000] = f_0MM;
+    (dist.f[DIR_0MM])[k_0MM] = f_0PP;
+    (dist.f[DIR_0PM])[k_00M] = f_0MP;
+    (dist.f[DIR_0MP])[k_0M0] = f_0PM;
     (dist.f[DIR_000])[k_000] = f_000;
-    (dist.f[DIR_PPP])[k_000]  = f_MMM;
-    (dist.f[DIR_PMP])[k_0M0]  = f_MPM;
-    (dist.f[DIR_PPM])[k_00M]  = f_MMP;
-    (dist.f[DIR_PMM])[k_0MM]  = f_MPP;
-    (dist.f[DIR_MPP])[k_M00]  = f_PMM;
-    (dist.f[DIR_MMP])[k_MM0]  = f_PPM;
-    (dist.f[DIR_MPM])[k_M0M]  = f_PMP;
-    (dist.f[DIR_MMM])[k_MMM]  = f_PPP;
+    (dist.f[DIR_PPP])[k_000] = f_MMM;
+    (dist.f[DIR_PMP])[k_0M0] = f_MPM;
+    (dist.f[DIR_PPM])[k_00M] = f_MMP;
+    (dist.f[DIR_PMM])[k_0MM] = f_MPP;
+    (dist.f[DIR_MPP])[k_M00] = f_PMM;
+    (dist.f[DIR_MMP])[k_MM0] = f_PPM;
+    (dist.f[DIR_MPM])[k_M0M] = f_PMP;
+    (dist.f[DIR_MMM])[k_MMM] = f_PPP;
 }
 
 template __global__ void LB_Kernel_CumulantK17 < TurbulenceModel::AMD, true, true > ( real omega_in, uint* neighborX, uint* neighborY, uint* neighborZ, real* distributions, real* rho, real* vx, real* vy, real* vz, real* turbulentViscosity, real SGSconstant, unsigned long long numberOfLBnodes, int level, real* forces, real* bodyForceX, real* bodyForceY, real* bodyForceZ, real* quadricLimiters, bool isEvenTimestep, const uint *fluidNodeIndices, uint numberOfFluidNodes);
diff --git a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
index 2b6a266c0d4e5f523091fa4982eee5d83b2ec675..0841d6931bba32440b47d02c9f83864a80f724be 100644
--- a/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
+++ b/src/gpu/VirtualFluids_GPU/KernelManager/GridScalingKernelManager.cpp
@@ -47,10 +47,10 @@ GridScalingKernelManager::GridScalingKernelManager(SPtr<Parameter> parameter, Gr
         if(!gridScalingFactory){
             throw std::runtime_error("There is more than one level, but no scalingFactory was provided.");
         }
-        checkScalingFunction(gridScalingFactory->getGridScalingFC(), this->para->getParD(0)->intFC, "scalingFineToCoarse");
-        checkScalingFunction(gridScalingFactory->getGridScalingCF(), this->para->getParD(0)->intCF, "scalingCoarseToFine");
-        this->scalingFineToCoarse = gridScalingFactory->getGridScalingFC();
-        this->scalingCoarseToFine = gridScalingFactory->getGridScalingCF();
+        checkScalingFunction(gridScalingFactory->getGridScalingFC(parameter->getUseTurbulentViscosity()), this->para->getParD(0)->intFC, "scalingFineToCoarse");
+        checkScalingFunction(gridScalingFactory->getGridScalingCF(parameter->getUseTurbulentViscosity()), this->para->getParD(0)->intCF, "scalingCoarseToFine");
+        this->scalingFineToCoarse = gridScalingFactory->getGridScalingFC(parameter->getUseTurbulentViscosity());
+        this->scalingCoarseToFine = gridScalingFactory->getGridScalingCF(parameter->getUseTurbulentViscosity());
     }
     
     if(this->scalingFineToCoarse == nullptr)
diff --git a/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/KernelUtilities.h b/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/KernelUtilities.h
index 37208ee59586533fa7f8ffbc269246826ed27fb8..e910f8ac5a71053d927e2531dcb225199d708749 100644
--- a/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/KernelUtilities.h
+++ b/src/gpu/VirtualFluids_GPU/LBM/GPUHelperFunctions/KernelUtilities.h
@@ -166,6 +166,14 @@ __inline__ __device__ real getInterpolatedDistributionForNoSlipBC(const real& q,
            + (q * (f + fInverse)) / (c1o1 + q);
 }
 
+__inline__ __device__ real getInterpolatedDistributionForNoSlipWithPressureBC(const real& q, const real& f, const real& fInverse, const real& feq, 
+                                                                  const real& omega, const real& drho, const real weight)
+{
+
+    return (c1o1-q) / (c1o1+q) * (f - fInverse + (f + fInverse - c2o1 * feq * omega) / (c1o1 - omega)) * c1o2 
+           + (q * (f + fInverse)) / (c1o1 + q) - weight * drho;
+}
+
 
 __inline__ __device__ real getInterpolatedDistributionForVeloWithPressureBC(const real& q, const real& f, const real& fInverse, const real& feq,
                                                                             const real& omega, const real& drho, const real& velocity, const real weight)